1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
60#include "llvm/IR/Attributes.h"
61#include "llvm/IR/Constants.h"
62#include "llvm/IR/DataLayout.h"
63#include "llvm/IR/DebugLoc.h"
65#include "llvm/IR/Function.h"
67#include "llvm/IR/GlobalValue.h"
68#include "llvm/IR/IRBuilder.h"
69#include "llvm/IR/Instruction.h"
72#include "llvm/IR/Intrinsics.h"
73#include "llvm/IR/IntrinsicsAArch64.h"
74#include "llvm/IR/Module.h"
76#include "llvm/IR/Type.h"
77#include "llvm/IR/Use.h"
78#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <bitset>
95#include <cassert>
96#include <cctype>
97#include <cstdint>
98#include <cstdlib>
99#include <iterator>
100#include <limits>
101#include <optional>
102#include <tuple>
103#include <utility>
104#include <vector>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in the future once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP operations use ALU ports, and the data dependency
143// will become the bottleneck after this transform on high-end CPUs. So this max
144// leaf node limitation is a guard to ensure that cmp+ccmp remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet fully
150// supported for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
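// Note: these arrays mirror the AAPCS64 argument registers; the first eight
// integer arguments are passed in X0-X7 and the first eight FP/SIMD arguments
// in Q0-Q7.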
177
179ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; }
180
181ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; }
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
207static inline EVT getPackedSVEVectorVT(ElementCount EC) {
 208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
222static inline EVT getPromotedVTForPredicate(EVT VT) {
 223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
 245 assert(DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
 246 "Expected legal vector type!");
 247 return VT.isFixedLengthVector() ||
 248 VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
 249}
250
251// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
252// predicate and end with a passthru value matching the result type.
253static bool isMergePassthruOpcode(unsigned Opc) {
254 switch (Opc) {
255 default:
256 return false;
257 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
258 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
259 case AArch64ISD::REVH_MERGE_PASSTHRU:
260 case AArch64ISD::REVW_MERGE_PASSTHRU:
261 case AArch64ISD::REVD_MERGE_PASSTHRU:
262 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
263 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
264 case AArch64ISD::DUP_MERGE_PASSTHRU:
265 case AArch64ISD::ABS_MERGE_PASSTHRU:
266 case AArch64ISD::NEG_MERGE_PASSTHRU:
267 case AArch64ISD::FNEG_MERGE_PASSTHRU:
268 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
269 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
270 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
271 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
272 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
273 case AArch64ISD::FRINT_MERGE_PASSTHRU:
274 case AArch64ISD::FROUND_MERGE_PASSTHRU:
275 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
276 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
277 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
278 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
279 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
280 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
281 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
282 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
283 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
284 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
285 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
286 case AArch64ISD::FABS_MERGE_PASSTHRU:
287 return true;
288 }
289}
290
291// Returns true if inactive lanes are known to be zeroed by construction.
292static bool isZeroingInactiveLanes(SDValue Op) {
 293 switch (Op.getOpcode()) {
294 default:
295 return false;
296 // We guarantee i1 splat_vectors to zero the other lanes
 297 case ISD::SPLAT_VECTOR:
 298 case ISD::GET_ACTIVE_LANE_MASK:
299 case AArch64ISD::PTRUE:
300 case AArch64ISD::SETCC_MERGE_ZERO:
301 return true;
 302 case ISD::INTRINSIC_WO_CHAIN:
 303 switch (Op.getConstantOperandVal(0)) {
304 default:
305 return false;
306 case Intrinsic::aarch64_sve_ptrue:
307 case Intrinsic::aarch64_sve_pnext:
308 case Intrinsic::aarch64_sve_cmpeq:
309 case Intrinsic::aarch64_sve_cmpne:
310 case Intrinsic::aarch64_sve_cmpge:
311 case Intrinsic::aarch64_sve_cmpgt:
312 case Intrinsic::aarch64_sve_cmphs:
313 case Intrinsic::aarch64_sve_cmphi:
314 case Intrinsic::aarch64_sve_cmpeq_wide:
315 case Intrinsic::aarch64_sve_cmpne_wide:
316 case Intrinsic::aarch64_sve_cmpge_wide:
317 case Intrinsic::aarch64_sve_cmpgt_wide:
318 case Intrinsic::aarch64_sve_cmplt_wide:
319 case Intrinsic::aarch64_sve_cmple_wide:
320 case Intrinsic::aarch64_sve_cmphs_wide:
321 case Intrinsic::aarch64_sve_cmphi_wide:
322 case Intrinsic::aarch64_sve_cmplo_wide:
323 case Intrinsic::aarch64_sve_cmpls_wide:
324 case Intrinsic::aarch64_sve_fcmpeq:
325 case Intrinsic::aarch64_sve_fcmpne:
326 case Intrinsic::aarch64_sve_fcmpge:
327 case Intrinsic::aarch64_sve_fcmpgt:
328 case Intrinsic::aarch64_sve_fcmpuo:
329 case Intrinsic::aarch64_sve_facgt:
330 case Intrinsic::aarch64_sve_facge:
331 case Intrinsic::aarch64_sve_whilege:
332 case Intrinsic::aarch64_sve_whilegt:
333 case Intrinsic::aarch64_sve_whilehi:
334 case Intrinsic::aarch64_sve_whilehs:
335 case Intrinsic::aarch64_sve_whilele:
336 case Intrinsic::aarch64_sve_whilelo:
337 case Intrinsic::aarch64_sve_whilels:
338 case Intrinsic::aarch64_sve_whilelt:
339 case Intrinsic::aarch64_sve_match:
340 case Intrinsic::aarch64_sve_nmatch:
341 case Intrinsic::aarch64_sve_whilege_x2:
342 case Intrinsic::aarch64_sve_whilegt_x2:
343 case Intrinsic::aarch64_sve_whilehi_x2:
344 case Intrinsic::aarch64_sve_whilehs_x2:
345 case Intrinsic::aarch64_sve_whilele_x2:
346 case Intrinsic::aarch64_sve_whilelo_x2:
347 case Intrinsic::aarch64_sve_whilels_x2:
348 case Intrinsic::aarch64_sve_whilelt_x2:
349 return true;
350 }
351 }
352}
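// For example, AArch64ISD::PTRUE and the SVE compare/while intrinsics listed
// above only ever set lanes selected by their governing predicate (or
// pattern); every other lane of the result is zero, so callers can treat the
// value as a zeroing predicate without inserting an explicit AND.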
353
354static std::tuple<SDValue, SDValue>
355extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
 356 SDLoc DL(Disc);
357 SDValue AddrDisc;
358 SDValue ConstDisc;
359
360 // If this is a blend, remember the constant and address discriminators.
361 // Otherwise, it's either a constant discriminator, or a non-blended
362 // address discriminator.
363 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
364 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
365 AddrDisc = Disc->getOperand(1);
366 ConstDisc = Disc->getOperand(2);
367 } else {
368 ConstDisc = Disc;
369 }
370
371 // If the constant discriminator (either the blend RHS, or the entire
372 // discriminator value) isn't a 16-bit constant, bail out, and let the
373 // discriminator be computed separately.
374 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
375 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
376 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
377
378 // If there's no address discriminator, use NoRegister, which we'll later
379 // replace with XZR, or directly use a Z variant of the inst. when available.
380 if (!AddrDisc)
381 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
382
383 return std::make_tuple(
384 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
385 AddrDisc);
386}
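// Worked example (illustrative): a discriminator built as
//   @llvm.ptrauth.blend(i64 %addr_disc, i64 1234)
// splits into (target-constant 1234, %addr_disc); a bare 16-bit constant
// discriminator splits into (constant, NoRegister/XZR); anything else falls
// back to (0, original discriminator) and is computed separately.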
387
388AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 389 const AArch64Subtarget &STI)
390 : TargetLowering(TM), Subtarget(&STI) {
391 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
392 // we have to make something up. Arbitrarily, choose ZeroOrOne.
394 // When comparing vectors the result sets the different elements in the
395 // vector to all-one or all-zero.
397
398 // Set up the register classes.
399 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
400 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
401
402 if (Subtarget->hasLS64()) {
403 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
404 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
405 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
406 }
407
408 if (Subtarget->hasFPARMv8()) {
409 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
410 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
411 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
412 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
413 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
414 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
415 }
416
417 if (Subtarget->hasNEON()) {
418 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
419 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
420
421 addDRType(MVT::v2f32);
422 addDRType(MVT::v8i8);
423 addDRType(MVT::v4i16);
424 addDRType(MVT::v2i32);
425 addDRType(MVT::v1i64);
426 addDRType(MVT::v1f64);
427 addDRType(MVT::v4f16);
428 addDRType(MVT::v4bf16);
429
430 addQRType(MVT::v4f32);
431 addQRType(MVT::v2f64);
432 addQRType(MVT::v16i8);
433 addQRType(MVT::v8i16);
434 addQRType(MVT::v4i32);
435 addQRType(MVT::v2i64);
436 addQRType(MVT::v8f16);
437 addQRType(MVT::v8bf16);
438 }
439
440 if (Subtarget->isSVEorStreamingSVEAvailable()) {
441 // Add legal sve predicate types
442 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
443 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
444 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
445 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
446 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
447
448 // Add legal sve data types
449 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
451 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
452 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
453
454 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
455 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
456 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
457 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
458 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
459 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
460
461 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
462 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
463 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
464
465 if (Subtarget->useSVEForFixedLengthVectors()) {
468 addRegisterClass(VT, &AArch64::ZPRRegClass);
469
472 addRegisterClass(VT, &AArch64::ZPRRegClass);
473 }
474 }
475
476 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
477 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
478 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
479 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
480
481 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
482 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
483 }
484
485 // Compute derived properties from the register classes
486 computeRegisterProperties(Subtarget->getRegisterInfo());
487
488 // Provide all sorts of operation actions
506 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
507 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
508 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
509 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
510 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
511 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
514 if (Subtarget->hasFPARMv8()) {
517 }
526 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
528 setOperationAction(ISD::BRIND, MVT::Other, Custom);
530
532
536
540
542
543 // Custom lowering hooks are needed for XOR
544 // to fold it into CSINC/CSINV.
547
548 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
549 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
550
551 // Virtually no operation on f128 is legal, but LLVM can't expand them when
552 // there's a valid register class, so we need custom operations in most cases.
553 setOperationAction(ISD::FABS, MVT::f128, Expand);
556 setOperationAction(ISD::FCOS, MVT::f128, Expand);
560 setOperationAction(ISD::FNEG, MVT::f128, Expand);
561 setOperationAction(ISD::FPOW, MVT::f128, Expand);
563 setOperationAction(ISD::FRINT, MVT::f128, Expand);
564 setOperationAction(ISD::FSIN, MVT::f128, Expand);
565 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
566 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
568 setOperationAction(ISD::FTAN, MVT::f128, Expand);
569 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
573 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
576 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
577 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
578 // aren't handled.
579
580 // Lowering for many of the conversions is actually specified by the non-f128
581 // type. The LowerXXX function will be trivial when f128 isn't involved.
606 if (Subtarget->hasFPARMv8()) {
609 }
612 if (Subtarget->hasFPARMv8()) {
615 }
618
623
624 // Variable arguments.
625 setOperationAction(ISD::VASTART, MVT::Other, Custom);
626 setOperationAction(ISD::VAARG, MVT::Other, Custom);
627 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
628 setOperationAction(ISD::VAEND, MVT::Other, Expand);
629
630 // Variable-sized objects.
631 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
632 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
633
634 // Lowering Funnel Shifts to EXTR
639
640 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
641
642 // Constant pool entries
644
645 // BlockAddress
647
648 // AArch64 lacks both left-rotate and popcount instructions.
654 }
655
656 // AArch64 doesn't have i32 MULH{S|U}.
659
660 // AArch64 doesn't have {U|S}MUL_LOHI.
665
666 if (Subtarget->hasCSSC()) {
670
672
676
679
684
689 } else {
693
696
699 }
700
706 }
713
714 // Custom lower Add/Sub/Mul with overflow.
727
736
737 setOperationAction(ISD::FSIN, MVT::f32, Expand);
738 setOperationAction(ISD::FSIN, MVT::f64, Expand);
739 setOperationAction(ISD::FCOS, MVT::f32, Expand);
740 setOperationAction(ISD::FCOS, MVT::f64, Expand);
741 setOperationAction(ISD::FPOW, MVT::f32, Expand);
742 setOperationAction(ISD::FPOW, MVT::f64, Expand);
745 if (Subtarget->hasFullFP16()) {
748 } else {
751 }
752
753 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
754 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
755 ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS,
756 ISD::FASIN, ISD::FATAN, ISD::FATAN2,
757 ISD::FCOSH, ISD::FSINH, ISD::FTANH,
758 ISD::FTAN, ISD::FEXP, ISD::FEXP2,
759 ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
767 setOperationAction(Op, MVT::f16, Promote);
768 setOperationAction(Op, MVT::v4f16, Expand);
769 setOperationAction(Op, MVT::v8f16, Expand);
770 setOperationAction(Op, MVT::bf16, Promote);
771 setOperationAction(Op, MVT::v4bf16, Expand);
772 setOperationAction(Op, MVT::v8bf16, Expand);
773 }
774
775 // Legalize fcanonicalize to circumvent default expansion
776 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
777 if (Subtarget->hasFullFP16()) {
779 }
780
781 // fpextend from f16 or bf16 to f32 is legal
782 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
783 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
786 // fpextend from bf16 to f64 needs to be split into two fpextends
787 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
789
790 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
791 for (auto Op : {
794 ISD::BR_CC,
795 ISD::FADD,
796 ISD::FSUB,
797 ISD::FMUL,
798 ISD::FDIV,
799 ISD::FMA,
800 ISD::FCEIL,
801 ISD::FSQRT,
802 ISD::FFLOOR,
803 ISD::FNEARBYINT,
804 ISD::FRINT,
805 ISD::FROUND,
806 ISD::FROUNDEVEN,
807 ISD::FTRUNC,
808 ISD::FMINNUM,
809 ISD::FMAXNUM,
810 ISD::FMINIMUM,
811 ISD::FMAXIMUM,
812 ISD::FMINIMUMNUM,
813 ISD::FMAXIMUMNUM,
832 })
833 setOperationAction(Op, ScalarVT, Promote);
834
835 for (auto Op : {ISD::FNEG, ISD::FABS})
836 setOperationAction(Op, ScalarVT, Legal);
837
 838 // Round-to-integer needs custom lowering for fp16, as Promote doesn't work
839 // because the result type is integer.
840 for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
843 setOperationAction(Op, ScalarVT, Custom);
844
845 // promote v4f16 to v4f32 when that is known to be safe.
846 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
847 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
848 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
849 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
850 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
851 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
852 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
853 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
854 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
855 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
856 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
857 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
858 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
859 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
860
861 setOperationAction(ISD::FABS, V4Narrow, Legal);
862 setOperationAction(ISD::FNEG, V4Narrow, Legal);
864 setOperationAction(ISD::BR_CC, V4Narrow, Expand);
868 setOperationAction(ISD::FSQRT, V4Narrow, Expand);
869
870 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
871 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
872 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
873
874 setOperationAction(ISD::FABS, V8Narrow, Legal);
876 setOperationAction(ISD::FCEIL, V8Narrow, Legal);
879 setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
882 setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
883 setOperationAction(ISD::FNEG, V8Narrow, Legal);
884 setOperationAction(ISD::FROUND, V8Narrow, Legal);
885 setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
886 setOperationAction(ISD::FRINT, V8Narrow, Legal);
887 setOperationAction(ISD::FSQRT, V8Narrow, Expand);
889 setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
890 setOperationAction(ISD::BR_CC, V8Narrow, Expand);
893 setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
894 };
895
896 if (!Subtarget->hasFullFP16()) {
897 LegalizeNarrowFP(MVT::f16);
898 }
899 LegalizeNarrowFP(MVT::bf16);
902
903 // AArch64 has implementations of a lot of rounding-like FP operations.
904 // clang-format off
905 for (auto Op :
906 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
907 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
908 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
909 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
910 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
911 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
917 for (MVT Ty : {MVT::f32, MVT::f64})
919 if (Subtarget->hasFullFP16())
920 setOperationAction(Op, MVT::f16, Legal);
921 }
922 // clang-format on
923
924 // Basic strict FP operations are legal
927 for (MVT Ty : {MVT::f32, MVT::f64})
929 if (Subtarget->hasFullFP16())
930 setOperationAction(Op, MVT::f16, Legal);
931 }
932
933 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
934
936 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
937 setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
938 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
939 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
940
941 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
942 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
943 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
944 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
945 } else {
946 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
947 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
948 }
949 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
950 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
951
952 // Generate outline atomics library calls only if LSE was not specified for
953 // subtarget
954 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
955 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
956 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
957 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
958 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
959 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
960 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
961 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
962 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
963 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
964 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
965 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
966 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
967 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
968 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
969 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
970 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
971 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
972 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
973 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
974 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
975 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
976 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
977 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
978 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
979 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
980 }
981
982 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
983 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f16, LibCall);
984 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f32, LibCall);
985 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f64, LibCall);
986 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::bf16, LibCall);
987
988 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f16, LibCall);
989 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f32, LibCall);
990 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f64, LibCall);
991 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::bf16, LibCall);
992
993 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f16, LibCall);
994 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f32, LibCall);
995 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f64, LibCall);
996 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::bf16, LibCall);
997
998 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f16, LibCall);
999 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f32, LibCall);
1000 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f64, LibCall);
1001 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::bf16, LibCall);
1002
1003 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f16, LibCall);
1004 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f32, LibCall);
1005 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f64, LibCall);
1006 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::bf16, LibCall);
1007 }
1008
1009 if (Subtarget->hasLSE128()) {
1010 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1011 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1012 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
1013 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
1014 setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
1015 }
1016
1017 // 128-bit loads and stores can be done without expanding
1018 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1019 setOperationAction(ISD::STORE, MVT::i128, Custom);
1020
1021 // Aligned 128-bit loads and stores are single-copy atomic according to the
1022 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1023 if (Subtarget->hasLSE2()) {
1024 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
1025 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
1026 }
1027
1028 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1029 // custom lowering, as there are no un-paired non-temporal stores and
1030 // legalization will break up 256 bit inputs.
1031 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1032 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1033 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1034 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1035 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1036 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1037 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1038 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1039
 1040 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
 1041 // custom lowering, as there are no un-paired non-temporal loads and
 1042 // legalization will break up 256 bit inputs.
1043 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1044 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1045 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1046 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1047 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1048 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1049 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1050 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1051
1052 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1053 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
1054
1055 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1056 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1057 // Issue __sincos_stret if available.
1058 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1059 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1060 } else {
1061 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1062 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1063 }
1064
1065 // Make floating-point constants legal for the large code model, so they don't
1066 // become loads from the constant pool.
1067 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1070 }
1071
1072 // AArch64 does not have floating-point extending loads, i1 sign-extending
1073 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1074 for (MVT VT : MVT::fp_valuetypes()) {
1075 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1076 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1077 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1078 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1079 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1080 }
1081 for (MVT VT : MVT::integer_valuetypes())
1082 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1083
1084 for (MVT WideVT : MVT::fp_valuetypes()) {
1085 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1086 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1087 setTruncStoreAction(WideVT, NarrowVT, Expand);
1088 }
1089 }
1090 }
1091
1092 if (Subtarget->hasFPARMv8()) {
1093 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1094 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
1095 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
1096 }
1097
1098 // Indexed loads and stores are supported.
 1099 for (unsigned im = (unsigned)ISD::PRE_INC;
 1100 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1101 setIndexedLoadAction(im, MVT::i8, Legal);
1102 setIndexedLoadAction(im, MVT::i16, Legal);
1103 setIndexedLoadAction(im, MVT::i32, Legal);
1104 setIndexedLoadAction(im, MVT::i64, Legal);
1105 setIndexedLoadAction(im, MVT::f64, Legal);
1106 setIndexedLoadAction(im, MVT::f32, Legal);
1107 setIndexedLoadAction(im, MVT::f16, Legal);
1108 setIndexedLoadAction(im, MVT::bf16, Legal);
1109 setIndexedStoreAction(im, MVT::i8, Legal);
1110 setIndexedStoreAction(im, MVT::i16, Legal);
1111 setIndexedStoreAction(im, MVT::i32, Legal);
1112 setIndexedStoreAction(im, MVT::i64, Legal);
1113 setIndexedStoreAction(im, MVT::f64, Legal);
1114 setIndexedStoreAction(im, MVT::f32, Legal);
1115 setIndexedStoreAction(im, MVT::f16, Legal);
1116 setIndexedStoreAction(im, MVT::bf16, Legal);
1117 }
1118
1119 // Trap.
1120 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1121 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1122 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
1123
1124 // We combine OR nodes for ccmp operations.
1126 // Try to create BICs for vector ANDs.
1128
1129 // llvm.init.trampoline and llvm.adjust.trampoline
1130 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
1131 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
1132
1133 // Vector add and sub nodes may conceal a high-half opportunity.
1134 // Also, try to fold ADD into CSINC/CSINV..
1137
1140
1141 // Try and combine setcc with csel
1143
1145
1149 ISD::STORE, ISD::BUILD_VECTOR});
1152 setTargetDAGCombine(ISD::LOAD);
1153
1154 setTargetDAGCombine(ISD::MSTORE);
1155
1157
1159
1162 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1163
1165 {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
1166
1167 setTargetDAGCombine(ISD::FP_EXTEND);
1168
1170
1172
1173 setTargetDAGCombine(ISD::GET_ACTIVE_LANE_MASK);
1174
1175 setTargetDAGCombine(ISD::VECREDUCE_AND);
1176 setTargetDAGCombine(ISD::VECREDUCE_OR);
1177 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1178
1180
1183
1184 // In case of strict alignment, avoid an excessive number of byte wide stores.
1187 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1188
1192 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1193
1196 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1197
1200 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1201
1203
1205
1206 EnableExtLdPromotion = true;
1207
1208 // Set required alignment.
1210 // Set preferred alignments.
1211
1212 // Don't align loops on Windows. The SEH unwind info generation needs to
1213 // know the exact length of functions before the alignments have been
1214 // expanded.
1215 if (!Subtarget->isTargetWindows())
1219
1220 // Only change the limit for entries in a jump table if specified by
 1221 // the subtarget, but not at the command line.
1222 unsigned MaxJT = STI.getMaximumJumpTableSize();
1223 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1225
1227
1229
1231 if (Subtarget->hasSME())
1233
1234 if (Subtarget->isNeonAvailable()) {
1235 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1236 // silliness like this:
1237 // clang-format off
1238 for (auto Op :
1239 {ISD::SELECT, ISD::SELECT_CC, ISD::FATAN2,
1240 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1242 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1243 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1244 ISD::FSIN, ISD::FCOS, ISD::FTAN,
1245 ISD::FASIN, ISD::FACOS, ISD::FATAN,
1246 ISD::FSINH, ISD::FCOSH, ISD::FTANH,
1247 ISD::FPOW, ISD::FLOG, ISD::FLOG2,
1248 ISD::FLOG10, ISD::FEXP, ISD::FEXP2,
1249 ISD::FEXP10, ISD::FRINT, ISD::FROUND,
1250 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM,
1251 ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM,
1252 ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1259 setOperationAction(Op, MVT::v1f64, Expand);
1260 // clang-format on
1261
1262 for (auto Op :
1267 setOperationAction(Op, MVT::v1i64, Expand);
1268
 1269 // AArch64 doesn't have direct vector -> f32 conversion instructions for
 1270 // elements smaller than i32, so promote the input to i32 first.
1271 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1272 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1273
 1274 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
 1275 // nor a direct i32 -> f16 vector conversion. Set it to Custom, so the
 1276 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1279 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1281
1282 if (Subtarget->hasFullFP16()) {
1285
1294 } else {
1295 // when AArch64 doesn't have fullfp16 support, promote the input
1296 // to i32 first.
1297 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1298 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1299 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1300 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1301 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1302 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1303 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1304 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1305 }
1306
1307 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1308 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1315 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1320 }
1321
1322 // Custom handling for some quad-vector types to detect MULL.
1323 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1324 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1325 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1326 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1327 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1328 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1329
1330 // Saturates
1331 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1332 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1337 }
1338
1339 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1340 MVT::v4i32}) {
1347 }
1348
1349 // Vector reductions
1350 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1351 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1352 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1353 setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
1354 setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
1355 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal);
1356 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal);
1357
1358 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1359 }
1360 }
1361 if (Subtarget->hasFullFP16())
1362 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
1363
1364 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1365 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1366 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1367 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1368 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1369 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1370 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1371 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1372 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1373 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1374 }
1375 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1376 setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
1377 setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
1378 setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
1379
1381 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1382 // Likewise, narrowing and extending vector loads/stores aren't handled
1383 // directly.
1386
1387 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1390 } else {
1393 }
1396
1399
1400 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1401 setTruncStoreAction(VT, InnerVT, Expand);
1402 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1403 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1404 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1405 }
1406 }
1407
1408 for (auto Op :
1409 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1410 ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1414 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1416 if (Subtarget->hasFullFP16())
1417 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1419 }
1420
1421 // LRINT and LLRINT.
1422 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1423 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1425 if (Subtarget->hasFullFP16())
1426 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1428 }
1429
1430 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1431
1432 setOperationAction(ISD::BITCAST, MVT::i2, Custom);
1433 setOperationAction(ISD::BITCAST, MVT::i4, Custom);
1434 setOperationAction(ISD::BITCAST, MVT::i8, Custom);
1435 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1436
1437 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
1438 setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
1439 setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
1440
1441 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1442 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1443 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1444 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1445 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1446 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1447
1448 // ADDP custom lowering
1449 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1451 // FADDP custom lowering
1452 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1454
1455 if (Subtarget->hasDotProd()) {
1456 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1457 ISD::PARTIAL_REDUCE_UMLA};
1458
1459 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1460 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1461 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1462
1463 if (Subtarget->hasMatMulInt8()) {
1464 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v4i32,
1465 MVT::v16i8, Legal);
1466 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i64,
1467 MVT::v16i8, Custom);
1468
1469 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i32,
1470 MVT::v8i8, Legal);
1471 }
1472 }
1473
1474 } else /* !isNeonAvailable */ {
1476 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1478
1479 if (VT.is128BitVector() || VT.is64BitVector()) {
1480 setOperationAction(ISD::LOAD, VT, Legal);
1481 setOperationAction(ISD::STORE, VT, Legal);
1482 setOperationAction(ISD::BITCAST, VT,
1483 Subtarget->isLittleEndian() ? Legal : Expand);
1484 }
1485 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1486 setTruncStoreAction(VT, InnerVT, Expand);
1487 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1488 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1489 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1490 }
1491 }
1492 }
1493
1494 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1498 }
1499
1500 if (Subtarget->hasSME()) {
1502 }
1503
1504 // FIXME: Move lowering for more nodes here if those are common between
1505 // SVE and SME.
1506 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1507 for (auto VT :
1508 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1513 }
1514 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1515 setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Legal);
1516 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Legal);
1517 }
1518
1519 if (Subtarget->hasSVE2p1() ||
1520 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1521 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, MVT::nxv32i1, Custom);
1522
1523 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1524 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Custom);
1525 }
1526
1527 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1528 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1539 setOperationAction(ISD::MLOAD, VT, Custom);
1559 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1560 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1561 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1562 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1563 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1564 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1565 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1566 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1569
1575
1584
1589
1590 if (!Subtarget->isLittleEndian())
1591 setOperationAction(ISD::BITCAST, VT, Custom);
1592
1593 if (Subtarget->hasSVE2() ||
1594 (Subtarget->hasSME() && Subtarget->isStreaming()))
1595 // For SLI/SRI.
1597 }
1598
1599 // Illegal unpacked integer vector types.
1600 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1603 }
1604
1605 // Type legalize unpacked bitcasts.
1606 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1607 setOperationAction(ISD::BITCAST, VT, Custom);
1608
1609 for (auto VT :
1610 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1611 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1613
1614 for (auto VT :
1615 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1620 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1621 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1622 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1623
1627
1628 // There are no legal MVT::nxv16f## based types.
1629 if (VT != MVT::nxv16i1) {
1634 }
1635 }
1636
1637 // NEON doesn't support masked loads/stores, but SME and SVE do.
1638 for (auto VT :
1639 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1640 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1641 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1642 setOperationAction(ISD::MLOAD, VT, Custom);
1643 setOperationAction(ISD::MSTORE, VT, Custom);
1644 }
1645
 1646 // Firstly, exclude all scalable vector extending loads/truncating stores,
 1647 // covering both integer and floating-point scalable vectors.
1649 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1650 setTruncStoreAction(VT, InnerVT, Expand);
1651 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1652 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1653 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1654 }
1655 }
1656
1657 // Then, selectively enable those which we directly support.
1658 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1659 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1660 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1661 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1662 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1663 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1664 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1665 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1666 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1667 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1668 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1669 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1670 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1671 }
1672
 1673 // SVE supports truncating stores of 64- and 128-bit vectors.
1674 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1675 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1676 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1677 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1678 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1679
1680 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1681 MVT::nxv4f32, MVT::nxv2f64}) {
1682 setOperationAction(ISD::BITCAST, VT, Custom);
1685 setOperationAction(ISD::MLOAD, VT, Custom);
1693 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1694 setOperationAction(ISD::FMAXNUM, VT, Custom);
1695 setOperationAction(ISD::FMINIMUM, VT, Custom);
1696 setOperationAction(ISD::FMINNUM, VT, Custom);
1698 setOperationAction(ISD::FNEG, VT, Custom);
1700 setOperationAction(ISD::FCEIL, VT, Custom);
1701 setOperationAction(ISD::FFLOOR, VT, Custom);
1702 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1703 setOperationAction(ISD::FRINT, VT, Custom);
1704 setOperationAction(ISD::LRINT, VT, Custom);
1705 setOperationAction(ISD::LLRINT, VT, Custom);
1706 setOperationAction(ISD::FROUND, VT, Custom);
1707 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1708 setOperationAction(ISD::FTRUNC, VT, Custom);
1709 setOperationAction(ISD::FSQRT, VT, Custom);
1710 setOperationAction(ISD::FABS, VT, Custom);
1711 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1713 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1714 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1715 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1716 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1717 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1721
1724 setOperationAction(ISD::FPOW, VT, Expand);
1725 setOperationAction(ISD::FPOWI, VT, Expand);
1726 setOperationAction(ISD::FCOS, VT, Expand);
1727 setOperationAction(ISD::FSIN, VT, Expand);
1728 setOperationAction(ISD::FSINCOS, VT, Expand);
1729 setOperationAction(ISD::FTAN, VT, Expand);
1730 setOperationAction(ISD::FACOS, VT, Expand);
1731 setOperationAction(ISD::FASIN, VT, Expand);
1732 setOperationAction(ISD::FATAN, VT, Expand);
1733 setOperationAction(ISD::FATAN2, VT, Expand);
1734 setOperationAction(ISD::FCOSH, VT, Expand);
1735 setOperationAction(ISD::FSINH, VT, Expand);
1736 setOperationAction(ISD::FTANH, VT, Expand);
1737 setOperationAction(ISD::FEXP, VT, Expand);
1738 setOperationAction(ISD::FEXP2, VT, Expand);
1739 setOperationAction(ISD::FEXP10, VT, Expand);
1740 setOperationAction(ISD::FLOG, VT, Expand);
1741 setOperationAction(ISD::FLOG2, VT, Expand);
1742 setOperationAction(ISD::FLOG10, VT, Expand);
1743
1755 }
1756
1757 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1758 setOperationAction(ISD::BITCAST, VT, Custom);
1760 setOperationAction(ISD::FABS, VT, Custom);
1762 setOperationAction(ISD::FNEG, VT, Custom);
1763 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1765 setOperationAction(ISD::MLOAD, VT, Custom);
1773
1774 if (Subtarget->hasSVEB16B16() &&
1775 Subtarget->isNonStreamingSVEorSME2Available()) {
1778 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1779 setOperationAction(ISD::FMAXNUM, VT, Custom);
1780 setOperationAction(ISD::FMINIMUM, VT, Custom);
1781 setOperationAction(ISD::FMINNUM, VT, Custom);
1784 }
1785 }
1786
1787 for (auto Opcode :
1788 {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
1789 ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC, ISD::SETCC,
1790 ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMAXIMUM,
1791 ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMINIMUM}) {
1792 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1793 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1794 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1795 }
1796
1797 if (!Subtarget->hasSVEB16B16() ||
1798 !Subtarget->isNonStreamingSVEorSME2Available()) {
1799 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1800 ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) {
1801 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1802 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1803 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1804 }
1805 }
1806
1809
1810 // NEON doesn't support integer divides, but SVE does
1811 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1812 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1815 }
1816
1817 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1818 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1819 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1820
1821 // NOTE: Currently this has to happen after computeRegisterProperties rather
1822 // than the preferred option of combining it with the addRegisterClass call.
1823 if (Subtarget->useSVEForFixedLengthVectors()) {
1826 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1827 addTypeForFixedLengthSVE(VT);
1828 }
1831 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1832 addTypeForFixedLengthSVE(VT);
1833 }
1834
1835 // 64bit results can mean a bigger than NEON input.
1836 for (auto VT : {MVT::v8i8, MVT::v4i16})
1839
1840 // 128bit results imply a bigger than NEON input.
1841 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1843 for (auto VT : {MVT::v8f16, MVT::v4f32})
1845
1846 // These operations are not supported on NEON but SVE can do them.
1848 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1849 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1850 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1851 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1852 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1853 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1854 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1855 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1856 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1857 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1858 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1859 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1860 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1861 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1862 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1863 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1864 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1865 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1866 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1867
1868 // Int operations with no NEON support.
1869 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1870 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1873 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1874 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1875 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1878 }
1879
1880 // Use SVE for vectors with more than 2 elements.
1881 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1882 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1883 }
1884
1885 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1886 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1887 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1888 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1889
1890 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1891
1892 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1894 }
1895
1896 // Handle partial reduction operations
1897 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1898 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1899 // Other pairs will default to 'Expand'.
1900 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1901 ISD::PARTIAL_REDUCE_UMLA};
1902 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1903 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1904
1905 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1906
1907 if (Subtarget->hasMatMulInt8()) {
1908 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv4i32,
1909 MVT::nxv16i8, Legal);
1910 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv2i64,
1911 MVT::nxv16i8, Custom);
1912 }
1913
1914 // Wide add types
1915 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1916 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1917 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1918 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
1919 }
1920 }
1921
1922 // Handle non-aliasing elements mask
1923 if (Subtarget->hasSVE2() ||
1924 (Subtarget->hasSME() && Subtarget->isStreaming())) {
1925 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
1926 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
1929 }
1930 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
1933 }
1934 }
1935
1936 // Handle operations that are only available in non-streaming SVE mode.
1937 if (Subtarget->isSVEAvailable()) {
1938 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1939 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1940 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1941 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1942 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1943 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1944 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1945 setOperationAction(ISD::MGATHER, VT, Custom);
1946 setOperationAction(ISD::MSCATTER, VT, Custom);
1947 }
1948
1949 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1950 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1951 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1952 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1953
1954 // We can lower types that have <vscale x {2|4}> elements to compact.
1955 for (auto VT :
1956 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1957 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1959
1960 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1961 // NEON vectors in the lowest bits of the SVE register.
1962 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1963 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1965
1966 // Histcnt is SVE2 only
1967 if (Subtarget->hasSVE2()) {
1968 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv4i32,
1969 Custom);
1970 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
1971 Custom);
1972
1973 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1974 ISD::PARTIAL_REDUCE_UMLA};
1975 // Must be lowered to SVE instructions.
1976 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
1977 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
1978 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1979 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
1980 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
1981 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
1982 }
1983 }
1984
1985 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1986 // Only required for llvm.aarch64.mops.memset.tag
1988 }
1989
1991
1992 if (Subtarget->hasSVE()) {
1993 setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
1994 setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
1995 setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
1996 setOperationAction(ISD::FLDEXP, MVT::bf16, Custom);
1997 }
1998
1999 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2000
2001 IsStrictFPEnabled = true;
2003
2004 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2005 // it, but it's just a wrapper around ldexp.
2006 if (Subtarget->isTargetWindows()) {
2007 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2008 if (isOperationExpand(Op, MVT::f32))
2009 setOperationAction(Op, MVT::f32, Promote);
2010 }
2011
2012 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
2013 // isn't legal.
2014 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2015 if (isOperationExpand(Op, MVT::f16))
2016 setOperationAction(Op, MVT::f16, Promote);
2017}
2018
2020 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2021}
2022
2023void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2024 assert(VT.isVector() && "VT should be a vector type");
2025
2026 if (VT.isFloatingPoint()) {
2028 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2029 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2030 }
2031
2032 // Mark vector float intrinsics as expand.
2033 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2034 setOperationAction(ISD::FSIN, VT, Expand);
2035 setOperationAction(ISD::FCOS, VT, Expand);
2036 setOperationAction(ISD::FTAN, VT, Expand);
2037 setOperationAction(ISD::FASIN, VT, Expand);
2038 setOperationAction(ISD::FACOS, VT, Expand);
2039 setOperationAction(ISD::FATAN, VT, Expand);
2040 setOperationAction(ISD::FATAN2, VT, Expand);
2041 setOperationAction(ISD::FSINH, VT, Expand);
2042 setOperationAction(ISD::FCOSH, VT, Expand);
2043 setOperationAction(ISD::FTANH, VT, Expand);
2044 setOperationAction(ISD::FPOW, VT, Expand);
2045 setOperationAction(ISD::FLOG, VT, Expand);
2046 setOperationAction(ISD::FLOG2, VT, Expand);
2047 setOperationAction(ISD::FLOG10, VT, Expand);
2048 setOperationAction(ISD::FEXP, VT, Expand);
2049 setOperationAction(ISD::FEXP2, VT, Expand);
2050 setOperationAction(ISD::FEXP10, VT, Expand);
2051 }
2052
2053 // But we do support custom-lowering for FCOPYSIGN.
2054 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2055 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2056 VT == MVT::v8f16) &&
2057 Subtarget->hasFullFP16()))
2059
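  // Hedged sketch of what the custom FCOPYSIGN lowering amounts to for, say,
  // v4f32: a bit-select of the sign bits, roughly
  //   movi v2.4s, #0x80, lsl #24    // per-lane sign-bit mask (assumed regs)
  //   bit  v0.16b, v1.16b, v2.16b   // copy the sign bits of v1 into v0
  // The exact instruction choice may differ; this is only an illustration.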
2072
2076 for (MVT InnerVT : MVT::all_valuetypes())
2077 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2078
 2079 // CNT supports only B element sizes, so use UADDLP afterwards to widen the result.
2080 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2082
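  // Illustrative sketch of that widening for a v4i32 CTPOP (register names
  // assumed):
  //   cnt    v0.16b, v0.16b   // per-byte population count
  //   uaddlp v0.8h,  v0.16b   // pairwise widen i8  -> i16
  //   uaddlp v0.4s,  v0.8h    // pairwise widen i16 -> i32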
2088
2089 for (unsigned Opcode :
2092 setOperationAction(Opcode, VT, Custom);
2093
2094 if (!VT.isFloatingPoint())
2096
2097 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2098 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2099 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2100 setOperationAction(Opcode, VT, Legal);
2101
2102 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2103 // NEON types.
2104 if (VT.isFloatingPoint() &&
2105 VT.getVectorElementType() != MVT::bf16 &&
2106 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2107 for (unsigned Opcode :
2108 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
2109 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::STRICT_FMINIMUM,
2113 setOperationAction(Opcode, VT, Legal);
2114
2115 // Strict fp extend and trunc are legal
2116 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2118 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2120
2121 // FIXME: We could potentially make use of the vector comparison instructions
 2122 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2123 // complications:
2124 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2125 // so we would need to expand when the condition code doesn't match the
2126 // kind of comparison.
2127 // * Some kinds of comparison require more than one FCMXY instruction so
2128 // would need to be expanded instead.
2129 // * The lowering of the non-strict versions involves target-specific ISD
2130 // nodes so we would likely need to add strict versions of all of them and
2131 // handle them appropriately.
2134
2135 // When little-endian we can use ordinary d and q register loads/stores for
2136 // vector types, but when big-endian we need to use structure load/store which
 2137 // only allows post-index addressing.
2138 if (Subtarget->isLittleEndian()) {
2139 for (unsigned im = (unsigned)ISD::PRE_INC;
2140 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2143 }
2144 } else {
2147 }
2148
2149 if (Subtarget->hasD128()) {
2152 }
2153
2154 if (VT.isInteger()) {
2155 // Let common code emit inverted variants of compares we do support.
2161 }
2162}
2163
2165 EVT OpVT) const {
2166 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2167 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2168 ResVT.getVectorElementType() != MVT::i1)
2169 return true;
2170
2171 // Only support illegal types if the result is scalable and min elements > 1.
2172 if (ResVT.getVectorMinNumElements() == 1 ||
2173 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2174 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2175 return true;
2176
2177 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2178 // but anything larger should be expanded.
2179 if (OpVT.getFixedSizeInBits() > 64)
2180 return true;
2181
2182 return false;
2183}
2184
2186 if (!Subtarget->isSVEorStreamingSVEAvailable())
2187 return true;
2188
2189 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2190 // also support fixed-width predicates.
2191 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2192 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2193 VT != MVT::v4i1 && VT != MVT::v2i1;
2194}
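// Illustrative sketch of the BRKB + CNTP sequence mentioned above, for a legal
// predicate type such as nxv16i1 (register assignment assumed):
//   brkb p1.b, p0/z, p2.b   // keep only the lanes before the first active one
//   cntp x0, p0, p1.b       // count those lanes
// which is why such types are not expanded here.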
2195
2197 unsigned SearchSize) const {
2198 // MATCH is SVE2 and only available in non-streaming mode.
2199 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2200 return true;
2201 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2202 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2203 return SearchSize != 8;
2204 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2205 return SearchSize != 8 && SearchSize != 16;
2206 return true;
2207}
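// For example, a match of a 16-byte search vector against an nxv16i8 operand
// (VT == nxv16i8, SearchSize == 16) is kept and is expected to map onto the
// SVE2 MATCH instruction; any other combination is expanded.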
2208
2209void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2210 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2211
2212 // By default everything must be expanded.
2213 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2215
2216 if (VT.isFloatingPoint()) {
2226 }
2227
2229 VT == MVT::v1f64 ? Expand : Custom;
2230
2231 // Mark integer truncating stores/extending loads as having custom lowering
2232 if (VT.isInteger()) {
2233 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2234 while (InnerVT != VT) {
2235 setTruncStoreAction(VT, InnerVT, Default);
2236 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2237 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2238 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2239 InnerVT = InnerVT.changeVectorElementType(
2240 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2241 }
2242 }
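    // For example, with VT == v4i32 the loop above visits InnerVT == v4i8 and
    // then v4i16, marking truncating stores and zero/sign/any-extending loads
    // for both pairs with the chosen action.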
2243
2244 // Mark floating-point truncating stores/extending loads as having custom
2245 // lowering
2246 if (VT.isFloatingPoint()) {
2247 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2248 while (InnerVT != VT) {
2249 setTruncStoreAction(VT, InnerVT, Custom);
2250 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2251 InnerVT = InnerVT.changeVectorElementType(
2253 }
2254 }
2255
2256 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2257 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2258
2259 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2260 ISD::PARTIAL_REDUCE_UMLA};
2261 unsigned NumElts = VT.getVectorNumElements();
2262 if (VT.getVectorElementType() == MVT::i64) {
2263 setPartialReduceMLAAction(MLAOps, VT,
2264 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2265 setPartialReduceMLAAction(MLAOps, VT,
2266 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2267 setPartialReduceMLAAction(MLAOps, VT,
2268 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2269 } else if (VT.getVectorElementType() == MVT::i32) {
2270 setPartialReduceMLAAction(MLAOps, VT,
2271 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2272 setPartialReduceMLAAction(MLAOps, VT,
2273 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2274 } else if (VT.getVectorElementType() == MVT::i16) {
2275 setPartialReduceMLAAction(MLAOps, VT,
2276 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2277 }
2278 if (Subtarget->hasMatMulInt8()) {
2279 if (VT.getVectorElementType() == MVT::i32)
2280 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2281 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2282 else if (VT.getVectorElementType() == MVT::i64)
2283 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2284 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2285 }
2286
2287 // Lower fixed length vector operations to scalable equivalents.
2294 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2304 setOperationAction(ISD::FABS, VT, Default);
2306 setOperationAction(ISD::FCEIL, VT, Default);
2309 setOperationAction(ISD::FFLOOR, VT, Default);
2311 setOperationAction(ISD::FMAXIMUM, VT, Default);
2312 setOperationAction(ISD::FMAXNUM, VT, Default);
2313 setOperationAction(ISD::FMINIMUM, VT, Default);
2314 setOperationAction(ISD::FMINNUM, VT, Default);
2316 setOperationAction(ISD::FNEARBYINT, VT, Default);
2317 setOperationAction(ISD::FNEG, VT, Default);
2318 setOperationAction(ISD::FP_EXTEND, VT, Default);
2322 setOperationAction(ISD::FRINT, VT, Default);
2323 setOperationAction(ISD::LRINT, VT, Default);
2324 setOperationAction(ISD::LLRINT, VT, Default);
2325 setOperationAction(ISD::FROUND, VT, Default);
2326 setOperationAction(ISD::FROUNDEVEN, VT, Default);
2327 setOperationAction(ISD::FSQRT, VT, Default);
2329 setOperationAction(ISD::FTRUNC, VT, Default);
2330 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Default);
2332 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2333 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2334 setOperationAction(ISD::MLOAD, VT, Default);
2335 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2336 setOperationAction(ISD::MSTORE, VT, Default);
2354 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2361 setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
2362 setOperationAction(ISD::VECREDUCE_AND, VT, Default);
2363 setOperationAction(ISD::VECREDUCE_FADD, VT, Default);
2364 setOperationAction(ISD::VECREDUCE_FMAX, VT, Default);
2365 setOperationAction(ISD::VECREDUCE_FMIN, VT, Default);
2366 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Default);
2367 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Default);
2368 setOperationAction(ISD::VECREDUCE_OR, VT, Default);
2369 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, PreferSVE ? Default : Expand);
2370 setOperationAction(ISD::VECREDUCE_SMAX, VT, Default);
2371 setOperationAction(ISD::VECREDUCE_SMIN, VT, Default);
2372 setOperationAction(ISD::VECREDUCE_UMAX, VT, Default);
2373 setOperationAction(ISD::VECREDUCE_UMIN, VT, Default);
2374 setOperationAction(ISD::VECREDUCE_XOR, VT, Default);
2380}
2381
2382void AArch64TargetLowering::addDRType(MVT VT) {
2383 addRegisterClass(VT, &AArch64::FPR64RegClass);
2384 if (Subtarget->isNeonAvailable())
2385 addTypeForNEON(VT);
2386}
2387
2388void AArch64TargetLowering::addQRType(MVT VT) {
2389 addRegisterClass(VT, &AArch64::FPR128RegClass);
2390 if (Subtarget->isNeonAvailable())
2391 addTypeForNEON(VT);
2392}
2393
2395 LLVMContext &C, EVT VT) const {
2396 if (!VT.isVector())
2397 return MVT::i32;
2398 if (VT.isScalableVector())
2399 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2401}
2402
2403// isIntImmediate - This method tests to see if the node is a constant
 2404 // operand. If so, Imm will receive the value.
2405static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2407 Imm = C->getZExtValue();
2408 return true;
2409 }
2410 return false;
2411}
2412
2413bool isVectorizedBinOp(unsigned Opcode) {
2414 switch (Opcode) {
2415 case AArch64ISD::SQDMULH:
2416 return true;
2417 default:
2418 return false;
2419 }
2420}
2421
2422// isOpcWithIntImmediate - This method tests to see if the node is a specific
 2423 // opcode and that it has an immediate integer right operand.
 2424 // If so, Imm will receive the value.
2425static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2426 uint64_t &Imm) {
2427 return N->getOpcode() == Opc &&
2428 isIntImmediate(N->getOperand(1).getNode(), Imm);
2429}
2430
2431static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2432 const APInt &Demanded,
2434 unsigned NewOpc) {
2435 uint64_t OldImm = Imm, NewImm, Enc;
2436 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2437
2438 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2439 // bimm64.
2440 if (Imm == 0 || Imm == Mask ||
2442 return false;
2443
2444 unsigned EltSize = Size;
2445 uint64_t DemandedBits = Demanded.getZExtValue();
2446
2447 // Clear bits that are not demanded.
2448 Imm &= DemandedBits;
2449
2450 while (true) {
2451 // The goal here is to set the non-demanded bits in a way that minimizes
2452 // the number of switching between 0 and 1. In order to achieve this goal,
2453 // we set the non-demanded bits to the value of the preceding demanded bits.
2454 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2455 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2456 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2457 // The final result is 0b11000011.
2458 uint64_t NonDemandedBits = ~DemandedBits;
2459 uint64_t InvertedImm = ~Imm & DemandedBits;
2460 uint64_t RotatedImm =
2461 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2462 NonDemandedBits;
2463 uint64_t Sum = RotatedImm + NonDemandedBits;
2464 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2465 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2466 NewImm = (Imm | Ones) & Mask;
2467
2468 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2469 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2470 // we halve the element size and continue the search.
2471 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2472 break;
2473
 2474 // We cannot shrink the element size any further if it is 2 bits.
2475 if (EltSize == 2)
2476 return false;
2477
2478 EltSize /= 2;
2479 Mask >>= EltSize;
2480 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2481
2482 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2483 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2484 return false;
2485
2486 // Merge the upper and lower halves of Imm and DemandedBits.
2487 Imm |= Hi;
2488 DemandedBits |= DemandedBitsHi;
2489 }
2490
2491 ++NumOptimizedImms;
2492
2493 // Replicate the element across the register width.
2494 while (EltSize < Size) {
2495 NewImm |= NewImm << EltSize;
2496 EltSize *= 2;
2497 }
2498
2499 (void)OldImm;
2500 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2501 "demanded bits should never be altered");
2502 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2503
2504 // Create the new constant immediate node.
2505 EVT VT = Op.getValueType();
2506 SDLoc DL(Op);
2507 SDValue New;
2508
2509 // If the new constant immediate is all-zeros or all-ones, let the target
2510 // independent DAG combine optimize this node.
2511 if (NewImm == 0 || NewImm == OrigMask) {
2512 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2513 TLO.DAG.getConstant(NewImm, DL, VT));
2514 // Otherwise, create a machine node so that target independent DAG combine
2515 // doesn't undo this optimization.
2516 } else {
2518 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2519 New = SDValue(
2520 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2521 }
2522
2523 return TLO.CombineTo(Op, New);
2524}
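// Illustrative summary: given a logical immediate that is not directly
// encodable, the loop above rewrites only the bits that are not demanded so
// that the result (or its complement) becomes a single shifted run of ones,
// possibly at a smaller element size which is then replicated across the
// register width. The rewritten constant can then be encoded directly in the
// ANDWri/ORRWri/EORWri-style instruction selected by NewOpc.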
2525
2527 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2528 TargetLoweringOpt &TLO) const {
2529 // Delay this optimization to as late as possible.
2530 if (!TLO.LegalOps)
2531 return false;
2532
2534 return false;
2535
2536 EVT VT = Op.getValueType();
2537 if (VT.isVector())
2538 return false;
2539
2540 unsigned Size = VT.getSizeInBits();
2541
2542 if (Size != 32 && Size != 64)
2543 return false;
2544
2545 // Exit early if we demand all bits.
2546 if (DemandedBits.popcount() == Size)
2547 return false;
2548
2549 unsigned NewOpc;
2550 switch (Op.getOpcode()) {
2551 default:
2552 return false;
2553 case ISD::AND:
2554 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2555 break;
2556 case ISD::OR:
2557 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2558 break;
2559 case ISD::XOR:
2560 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2561 break;
2562 }
2563 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2564 if (!C)
2565 return false;
2566 uint64_t Imm = C->getZExtValue();
2567 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2568}
2569
2570/// computeKnownBitsForTargetNode - Determine which of the bits specified in
 2571 /// Mask are known to be either zero or one and return them in Known.
2573 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2574 const SelectionDAG &DAG, unsigned Depth) const {
2575 switch (Op.getOpcode()) {
2576 default:
2577 break;
2578 case AArch64ISD::DUP: {
2579 SDValue SrcOp = Op.getOperand(0);
2580 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2581 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2582 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2583 "Expected DUP implicit truncation");
2584 Known = Known.trunc(Op.getScalarValueSizeInBits());
2585 }
2586 break;
2587 }
2588 case AArch64ISD::CSEL: {
2589 KnownBits Known2;
2590 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2591 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2592 Known = Known.intersectWith(Known2);
2593 break;
2594 }
2595 case AArch64ISD::CSNEG:
2596 case AArch64ISD::CSINC:
2597 case AArch64ISD::CSINV: {
2598 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2599 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2600
2601 // The result is either:
2602 // CSINC: KnownOp0 or KnownOp1 + 1
2603 // CSINV: KnownOp0 or ~KnownOp1
2604 // CSNEG: KnownOp0 or KnownOp1 * -1
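    // For example, if KnownOp1 is the constant 7 and this is a CSINC, the
    // second candidate becomes the constant 8 before the two candidates are
    // intersected below.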
2605 if (Op.getOpcode() == AArch64ISD::CSINC)
2606 KnownOp1 = KnownBits::add(
2607 KnownOp1,
2608 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2609 else if (Op.getOpcode() == AArch64ISD::CSINV)
2610 std::swap(KnownOp1.Zero, KnownOp1.One);
2611 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2612 KnownOp1 =
2614 Op.getScalarValueSizeInBits())));
2615
2616 Known = KnownOp0.intersectWith(KnownOp1);
2617 break;
2618 }
2619 case AArch64ISD::BICi: {
2620 // Compute the bit cleared value.
2621 APInt Mask =
2622 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2623 .trunc(Known.getBitWidth());
2624 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2625 Known &= KnownBits::makeConstant(Mask);
2626 break;
2627 }
2628 case AArch64ISD::VLSHR: {
2629 KnownBits Known2;
2630 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2631 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2632 Known = KnownBits::lshr(Known, Known2);
2633 break;
2634 }
2635 case AArch64ISD::VASHR: {
2636 KnownBits Known2;
2637 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2638 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2639 Known = KnownBits::ashr(Known, Known2);
2640 break;
2641 }
2642 case AArch64ISD::VSHL: {
2643 KnownBits Known2;
2644 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2645 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2646 Known = KnownBits::shl(Known, Known2);
2647 break;
2648 }
2649 case AArch64ISD::MOVI: {
2651 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2652 break;
2653 }
2654 case AArch64ISD::MOVIshift: {
2656 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2657 << Op->getConstantOperandVal(1)));
2658 break;
2659 }
2660 case AArch64ISD::MOVImsl: {
2661 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2663 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2664 break;
2665 }
2666 case AArch64ISD::MOVIedit: {
2668 Known.getBitWidth(),
2669 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2670 break;
2671 }
2672 case AArch64ISD::MVNIshift: {
2674 APInt(Known.getBitWidth(),
2675 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2676 /*isSigned*/ false, /*implicitTrunc*/ true));
2677 break;
2678 }
2679 case AArch64ISD::MVNImsl: {
2680 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2682 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2683 /*isSigned*/ false, /*implicitTrunc*/ true));
2684 break;
2685 }
2686 case AArch64ISD::LOADgot:
2687 case AArch64ISD::ADDlow: {
2688 if (!Subtarget->isTargetILP32())
2689 break;
2690 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2691 Known.Zero = APInt::getHighBitsSet(64, 32);
2692 break;
2693 }
2694 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2695 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2696 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2697 break;
2698 }
2700 Intrinsic::ID IntID =
2701 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2702 switch (IntID) {
2703 default: return;
2704 case Intrinsic::aarch64_ldaxr:
2705 case Intrinsic::aarch64_ldxr: {
2706 unsigned BitWidth = Known.getBitWidth();
2707 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2708 unsigned MemBits = VT.getScalarSizeInBits();
2709 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2710 return;
2711 }
2712 }
2713 break;
2714 }
2716 case ISD::INTRINSIC_VOID: {
2717 unsigned IntNo = Op.getConstantOperandVal(0);
2718 switch (IntNo) {
2719 default:
2720 break;
2721 case Intrinsic::aarch64_neon_uaddlv: {
2722 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2723 unsigned BitWidth = Known.getBitWidth();
2724 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2725 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2726 assert(BitWidth >= Bound && "Unexpected width!");
2728 Known.Zero |= Mask;
2729 }
2730 break;
2731 }
2732 case Intrinsic::aarch64_neon_umaxv:
2733 case Intrinsic::aarch64_neon_uminv: {
2734 // Figure out the datatype of the vector operand. The UMINV instruction
2735 // will zero extend the result, so we can mark as known zero all the
 2736 // bits larger than the element datatype. 32-bit or larger doesn't need
2737 // this as those are legal types and will be handled by isel directly.
2738 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2739 unsigned BitWidth = Known.getBitWidth();
2740 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2741 assert(BitWidth >= 8 && "Unexpected width!");
2743 Known.Zero |= Mask;
2744 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2745 assert(BitWidth >= 16 && "Unexpected width!");
2747 Known.Zero |= Mask;
2748 }
2749 break;
2750 } break;
2751 }
2752 }
2753 }
2754}
2755
2757 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2758 unsigned Depth) const {
2759 EVT VT = Op.getValueType();
2760 unsigned VTBits = VT.getScalarSizeInBits();
2761 unsigned Opcode = Op.getOpcode();
2762 switch (Opcode) {
2763 case AArch64ISD::FCMEQ:
2764 case AArch64ISD::FCMGE:
2765 case AArch64ISD::FCMGT:
2766 // Compares return either 0 or all-ones
2767 return VTBits;
2768 case AArch64ISD::VASHR: {
2769 unsigned Tmp =
2770 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2771 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2772 }
2773 }
2774
2775 return 1;
2776}
2777
2779 EVT) const {
2780 return MVT::i64;
2781}
2782
2784 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2785 unsigned *Fast) const {
2786
2787 // Allow SVE loads/stores where the alignment >= the size of the element type,
2788 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2789 // for stores that come from IR, only require element-size alignment (even if
2790 // unaligned accesses are disabled). Without this, these will be forced to
2791 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2792 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2793 if (VT.isScalableVector()) {
2794 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2795 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2796 return true;
2797 }
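  // For example, a 4-byte aligned nxv4i32 access is accepted here even under
  // +strict-align, since the predicated SVE loads/stores described above only
  // need element alignment.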
2798
2799 if (Subtarget->requiresStrictAlign())
2800 return false;
2801
2802 if (Fast) {
2803 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2804 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2805 // See comments in performSTORECombine() for more details about
2806 // these conditions.
2807
2808 // Code that uses clang vector extensions can mark that it
2809 // wants unaligned accesses to be treated as fast by
2810 // underspecifying alignment to be 1 or 2.
2811 Alignment <= 2 ||
2812
2813 // Disregard v2i64. Memcpy lowering produces those and splitting
2814 // them regresses performance on micro-benchmarks and olden/bh.
2815 VT == MVT::v2i64;
2816 }
2817 return true;
2818}
2819
2820// Same as above but handling LLTs instead.
2822 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2823 unsigned *Fast) const {
2824 if (Subtarget->requiresStrictAlign())
2825 return false;
2826
2827 if (Fast) {
2828 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2829 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2830 Ty.getSizeInBytes() != 16 ||
2831 // See comments in performSTORECombine() for more details about
2832 // these conditions.
2833
2834 // Code that uses clang vector extensions can mark that it
2835 // wants unaligned accesses to be treated as fast by
2836 // underspecifying alignment to be 1 or 2.
2837 Alignment <= 2 ||
2838
2839 // Disregard v2i64. Memcpy lowering produces those and splitting
2840 // them regresses performance on micro-benchmarks and olden/bh.
2841 Ty == LLT::fixed_vector(2, 64);
2842 }
2843 return true;
2844}
2845
2846FastISel *
2848 const TargetLibraryInfo *libInfo) const {
2849 return AArch64::createFastISel(funcInfo, libInfo);
2850}
2851
2854 MachineBasicBlock *MBB) const {
2855 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2856 // phi node:
2857
2858 // OrigBB:
2859 // [... previous instrs leading to comparison ...]
2860 // b.ne TrueBB
2861 // b EndBB
2862 // TrueBB:
2863 // ; Fallthrough
2864 // EndBB:
2865 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2866
2867 MachineFunction *MF = MBB->getParent();
2868 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2869 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2870 DebugLoc DL = MI.getDebugLoc();
2871 MachineFunction::iterator It = ++MBB->getIterator();
2872
2873 Register DestReg = MI.getOperand(0).getReg();
2874 Register IfTrueReg = MI.getOperand(1).getReg();
2875 Register IfFalseReg = MI.getOperand(2).getReg();
2876 unsigned CondCode = MI.getOperand(3).getImm();
2877 bool NZCVKilled = MI.getOperand(4).isKill();
2878
2879 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2880 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2881 MF->insert(It, TrueBB);
2882 MF->insert(It, EndBB);
2883
2884 // Transfer rest of current basic-block to EndBB
2885 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2886 MBB->end());
2888
2889 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2890 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2891 MBB->addSuccessor(TrueBB);
2892 MBB->addSuccessor(EndBB);
2893
2894 // TrueBB falls through to the end.
2895 TrueBB->addSuccessor(EndBB);
2896
2897 if (!NZCVKilled) {
2898 TrueBB->addLiveIn(AArch64::NZCV);
2899 EndBB->addLiveIn(AArch64::NZCV);
2900 }
2901
2902 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2903 .addReg(IfTrueReg)
2904 .addMBB(TrueBB)
2905 .addReg(IfFalseReg)
2906 .addMBB(MBB);
2907
2908 MI.eraseFromParent();
2909 return EndBB;
2910}
2911
2919
2922 MachineBasicBlock *MBB) const {
2923 MachineFunction &MF = *MBB->getParent();
2924 MachineBasicBlock::iterator MBBI = MI.getIterator();
2925 const AArch64InstrInfo &TII =
2926 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2927 Register TargetReg = MI.getOperand(0).getReg();
2929 TII.probedStackAlloc(MBBI, TargetReg, false);
2930
2931 MI.eraseFromParent();
2932 return NextInst->getParent();
2933}
2934
2937 MachineBasicBlock *MBB) const {
2938 MachineFunction *MF = MBB->getParent();
2940
2941 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
2942 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
2943
2944 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
2945 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
2946 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
2947 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
2948
2949 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2950 DebugLoc DL = MI.getDebugLoc();
2951
 2952 // RDVL requires GPR64, ADDSVL requires GPR64sp.
 2953 // We need to insert COPY instructions; these will later be removed by the
 2954 // RegisterCoalescer.
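  // Roughly, the sequence built below is (illustrative, shown before register
  // allocation):
  //   rdvl   x8, #1        // non-streaming vector length in bytes
  //   addsvl x9, x8, #-1   // subtract the streaming vector length
  //   cbz    x9, PassBB    // lengths match: continue
  //   brk    #1            // otherwise trap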
2955 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
2956 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
2957 .addReg(RegVL_GPR);
2958
2959 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
2960 .addReg(RegVL_GPRsp)
2961 .addImm(-1);
2962 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
2963 .addReg(RegSVL_GPRsp);
2964
2965 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2966 MachineFunction::iterator It = ++MBB->getIterator();
2967 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
2968 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
2969 MF->insert(It, TrapBB);
2970 MF->insert(It, PassBB);
2971
2972 // Continue if vector lengths match
2973 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
2974 .addReg(RegSVL_GPR)
2975 .addMBB(PassBB);
2976
2977 // Transfer rest of current BB to PassBB
2978 PassBB->splice(PassBB->begin(), MBB,
2979 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
2981
2982 // Trap if vector lengths mismatch
2983 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
2984
2985 MBB->addSuccessor(TrapBB);
2986 MBB->addSuccessor(PassBB);
2987
2988 MI.eraseFromParent();
2989 return PassBB;
2990}
2991
2993AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2995 MachineBasicBlock *BB) const {
2996 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2997 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2998
2999 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3000 MIB.add(MI.getOperand(1)); // slice index register
3001 MIB.add(MI.getOperand(2)); // slice index offset
3002 MIB.add(MI.getOperand(3)); // pg
3003 MIB.add(MI.getOperand(4)); // base
3004 MIB.add(MI.getOperand(5)); // offset
3005
3006 MI.eraseFromParent(); // The pseudo is gone now.
3007 return BB;
3008}
3009
3012 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3014 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3015
3016 MIB.addReg(AArch64::ZA, RegState::Define);
3017 MIB.add(MI.getOperand(0)); // Vector select register
3018 MIB.add(MI.getOperand(1)); // Vector select offset
3019 MIB.add(MI.getOperand(2)); // Base
3020 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3021
3022 MI.eraseFromParent(); // The pseudo is gone now.
3023 return BB;
3024}
3025
3028 unsigned Opcode,
3029 bool Op0IsDef) const {
3030 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3032
3033 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3034 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3035 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3036 MIB.add(MI.getOperand(I));
3037
3038 MI.eraseFromParent(); // The pseudo is gone now.
3039 return BB;
3040}
3041
3043AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3045 MachineBasicBlock *BB) const {
3046 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3047 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3048 unsigned StartIdx = 0;
3049
3050 bool HasTile = BaseReg != AArch64::ZA;
3051 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3052 if (HasZPROut) {
3053 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3054 ++StartIdx;
3055 }
3056 if (HasTile) {
3057 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3058 RegState::Define); // Output ZA Tile
3059 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3060 StartIdx++;
3061 } else {
3062 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm,
3063 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3064 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3065 ++StartIdx;
3066 }
3067 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3068 }
3069 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3070 MIB.add(MI.getOperand(I));
3071
3072 MI.eraseFromParent(); // The pseudo is gone now.
3073 return BB;
3074}
3075
3078 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3080 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3081 MIB.add(MI.getOperand(0)); // Mask
3082
3083 unsigned Mask = MI.getOperand(0).getImm();
3084 for (unsigned I = 0; I < 8; I++) {
3085 if (Mask & (1 << I))
3086 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3087 }
3088
3089 MI.eraseFromParent(); // The pseudo is gone now.
3090 return BB;
3091}
3092
3095 MachineBasicBlock *BB) const {
3096 MachineFunction *MF = BB->getParent();
3097 MachineFrameInfo &MFI = MF->getFrameInfo();
3099 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3100 if (TPIDR2.Uses > 0) {
3101 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3102 // generally don't support big-endian SVE/SME.
3103 if (!Subtarget->isLittleEndian())
3105 "TPIDR2 block initialization is not supported on big-endian targets");
3106
3107 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3108 // Store buffer pointer and num_za_save_slices.
3109 // Bytes 10-15 are implicitly zeroed.
3110 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3111 .addReg(MI.getOperand(0).getReg())
3112 .addReg(MI.getOperand(1).getReg())
3113 .addFrameIndex(TPIDR2.FrameIndex)
3114 .addImm(0);
3115 } else
3116 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3117
3118 BB->remove_instr(&MI);
3119 return BB;
3120}
3121
3124 MachineBasicBlock *BB) const {
3125 MachineFunction *MF = BB->getParent();
3126 MachineFrameInfo &MFI = MF->getFrameInfo();
3128 // TODO This function grows the stack with a subtraction, which doesn't work
3129 // on Windows. Some refactoring to share the functionality in
3130 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3131 // supports SME
3133 "Lazy ZA save is not yet supported on Windows");
3134
3135 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3136
3137 if (TPIDR2.Uses > 0) {
3138 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3140
3141 // The SUBXrs below won't always be emitted in a form that accepts SP
3142 // directly
3143 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3144 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3145 .addReg(AArch64::SP);
3146
3147 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3148 auto Size = MI.getOperand(1).getReg();
3149 auto Dest = MI.getOperand(0).getReg();
3150 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3151 .addReg(Size)
3152 .addReg(Size)
3153 .addReg(SP);
3154 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3155 AArch64::SP)
3156 .addReg(Dest);
3157
3158 // We have just allocated a variable sized object, tell this to PEI.
3159 MFI.CreateVariableSizedObject(Align(16), nullptr);
3160 }
3161
3162 BB->remove_instr(&MI);
3163 return BB;
3164}
3165
3166// TODO: Find a way to merge this with EmitAllocateZABuffer.
3169 MachineBasicBlock *BB) const {
3170 MachineFunction *MF = BB->getParent();
3171 MachineFrameInfo &MFI = MF->getFrameInfo();
3174 "Lazy ZA save is not yet supported on Windows");
3175
3176 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3177 if (FuncInfo->isSMESaveBufferUsed()) {
3178 // Allocate a buffer object of the size given by MI.getOperand(1).
3179 auto Size = MI.getOperand(1).getReg();
3180 auto Dest = MI.getOperand(0).getReg();
3181 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3182 .addReg(AArch64::SP)
3183 .addReg(Size)
3185 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3186 .addReg(AArch64::SP);
3187
3188 // We have just allocated a variable sized object, tell this to PEI.
3189 MFI.CreateVariableSizedObject(Align(16), nullptr);
3190 } else
3191 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3192 MI.getOperand(0).getReg());
3193
3194 BB->remove_instr(&MI);
3195 return BB;
3196}
3197
3200 MachineBasicBlock *BB) const {
3201 // If the buffer is used, emit a call to __arm_sme_state_size()
3202 MachineFunction *MF = BB->getParent();
3204 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3205 if (FuncInfo->isSMESaveBufferUsed()) {
3206 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3207 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3208 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3210 .addReg(AArch64::X0, RegState::ImplicitDefine)
3211 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3212 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3213 MI.getOperand(0).getReg())
3214 .addReg(AArch64::X0);
3215 } else
3216 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3217 MI.getOperand(0).getReg())
3218 .addReg(AArch64::XZR);
3219 BB->remove_instr(&MI);
3220 return BB;
3221}
3222
3225 MachineBasicBlock *BB) const {
3226 MachineFunction *MF = BB->getParent();
3227 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3228 const DebugLoc &DL = MI.getDebugLoc();
3229 Register ResultReg = MI.getOperand(0).getReg();
3230 if (MF->getRegInfo().use_empty(ResultReg)) {
3231 // Nothing to do. Pseudo erased below.
3232 } else if (Subtarget->hasSME()) {
3233 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3234 .addImm(AArch64SysReg::SVCR)
3235 .addReg(AArch64::VG, RegState::Implicit);
3236 } else {
3237 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3238 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3239 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
3241 .addReg(AArch64::X0, RegState::ImplicitDefine)
3242 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3243 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3244 .addReg(AArch64::X0);
3245 }
3246 MI.eraseFromParent();
3247 return BB;
3248}
3249
3250// Helper function to find the instruction that defined a virtual register.
 3251 // If unable to find such an instruction, returns nullptr.
3253 Register Reg) {
3254 while (Reg.isVirtual()) {
3255 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3256 assert(DefMI && "Virtual register definition not found");
3257 unsigned Opcode = DefMI->getOpcode();
3258
3259 if (Opcode == AArch64::COPY) {
3260 Reg = DefMI->getOperand(1).getReg();
3261 // Vreg is defined by copying from physreg.
3262 if (Reg.isPhysical())
3263 return DefMI;
3264 continue;
3265 }
3266 if (Opcode == AArch64::SUBREG_TO_REG) {
3267 Reg = DefMI->getOperand(2).getReg();
3268 continue;
3269 }
3270
3271 return DefMI;
3272 }
3273 return nullptr;
3274}
3275
3278 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3279 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3280 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3281 const DebugLoc &DL = MI.getDebugLoc();
3282
3283 Register AddrDisc = AddrDiscOp.getReg();
3284 int64_t IntDisc = IntDiscOp.getImm();
3285 assert(IntDisc == 0 && "Blend components are already expanded");
3286
3287 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3288 if (DiscMI) {
3289 switch (DiscMI->getOpcode()) {
3290 case AArch64::MOVKXi:
3291 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3292 // #imm should be an immediate and not a global symbol, for example.
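    // For example (values assumed for illustration): if the discriminator was
    // materialised as "MOVK %addr, #1234, lsl #48", it is split back into
    // AddrDisc = %addr and IntDisc = 1234 so the immediate form can be used.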
3293 if (DiscMI->getOperand(2).isImm() &&
3294 DiscMI->getOperand(3).getImm() == 48) {
3295 AddrDisc = DiscMI->getOperand(1).getReg();
3296 IntDisc = DiscMI->getOperand(2).getImm();
3297 }
3298 break;
3299 case AArch64::MOVi32imm:
3300 case AArch64::MOVi64imm:
3301 // Small immediate integer constant passed via VReg.
3302 if (DiscMI->getOperand(1).isImm() &&
3303 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3304 AddrDisc = AArch64::NoRegister;
3305 IntDisc = DiscMI->getOperand(1).getImm();
3306 }
3307 break;
3308 }
3309 }
3310
3311 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3312 // in the requested register class.
3313 if (AddrDisc == AArch64::XZR)
3314 AddrDisc = AArch64::NoRegister;
3315
3316 // Make sure AddrDisc operand respects the register class imposed by MI.
3317 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3318 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3319 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3320 AddrDisc = TmpReg;
3321 }
3322
3323 AddrDiscOp.setReg(AddrDisc);
3324 IntDiscOp.setImm(IntDisc);
3325}
3326
3328 MachineInstr &MI, MachineBasicBlock *BB) const {
3329
3330 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3331 if (SMEOrigInstr != -1) {
3332 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3333 uint64_t SMEMatrixType =
3334 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3335 switch (SMEMatrixType) {
3337 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3339 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3341 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3343 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3345 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3347 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3348 }
3349 }
3350
3351 switch (MI.getOpcode()) {
3352 default:
3353#ifndef NDEBUG
3354 MI.dump();
3355#endif
3356 llvm_unreachable("Unexpected instruction for custom inserter!");
3357 case AArch64::InitTPIDR2Obj:
3358 return EmitInitTPIDR2Object(MI, BB);
3359 case AArch64::AllocateZABuffer:
3360 return EmitAllocateZABuffer(MI, BB);
3361 case AArch64::AllocateSMESaveBuffer:
3362 return EmitAllocateSMESaveBuffer(MI, BB);
3363 case AArch64::GetSMESaveSize:
3364 return EmitGetSMESaveSize(MI, BB);
3365 case AArch64::EntryPStateSM:
3366 return EmitEntryPStateSM(MI, BB);
3367 case AArch64::F128CSEL:
3368 return EmitF128CSEL(MI, BB);
3369 case TargetOpcode::STATEPOINT:
3370 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
 3371 // while the bl call instruction (where the statepoint will be lowered at the end)
3372 // has implicit def. This def is early-clobber as it will be set at
3373 // the moment of the call and earlier than any use is read.
3374 // Add this implicit dead def here as a workaround.
3375 MI.addOperand(*MI.getMF(),
3377 AArch64::LR, /*isDef*/ true,
3378 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3379 /*isUndef*/ false, /*isEarlyClobber*/ true));
3380 [[fallthrough]];
3381 case TargetOpcode::STACKMAP:
3382 case TargetOpcode::PATCHPOINT:
3383 return emitPatchPoint(MI, BB);
3384
3385 case TargetOpcode::PATCHABLE_EVENT_CALL:
3386 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3387 return BB;
3388
3389 case AArch64::CATCHRET:
3390 return EmitLoweredCatchRet(MI, BB);
3391
3392 case AArch64::PROBED_STACKALLOC_DYN:
3393 return EmitDynamicProbedAlloc(MI, BB);
3394
3395 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3396 return EmitCheckMatchingVL(MI, BB);
3397
3398 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3399 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3400 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3401 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3402 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3403 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3404 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3405 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3406 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3407 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3408 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3409 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3410 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3411 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3412 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3413 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3414 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3415 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3416 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3417 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3418 case AArch64::LDR_ZA_PSEUDO:
3419 return EmitFill(MI, BB);
3420 case AArch64::LDR_TX_PSEUDO:
3421 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3422 case AArch64::STR_TX_PSEUDO:
3423 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3424 case AArch64::ZERO_M_PSEUDO:
3425 return EmitZero(MI, BB);
3426 case AArch64::ZERO_T_PSEUDO:
3427 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3428 case AArch64::MOVT_TIZ_PSEUDO:
3429 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3430
3431 case AArch64::PAC:
3432 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3433 &AArch64::GPR64noipRegClass);
3434 return BB;
3435 }
3436}
3437
3438//===----------------------------------------------------------------------===//
3439// AArch64 Lowering private implementation.
3440//===----------------------------------------------------------------------===//
3441
3442//===----------------------------------------------------------------------===//
3443// Lowering Code
3444//===----------------------------------------------------------------------===//
3445
3446// Forward declarations of SVE fixed length lowering helpers
3451 SelectionDAG &DAG);
3454 EVT VT);
3455
3456/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3457static bool isZerosVector(const SDNode *N) {
3458 // Look through a bit convert.
3459 while (N->getOpcode() == ISD::BITCAST)
3460 N = N->getOperand(0).getNode();
3461
3463 return true;
3464
3465 if (N->getOpcode() != AArch64ISD::DUP)
3466 return false;
3467
3468 auto Opnd0 = N->getOperand(0);
3469 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3470}
3471
3472/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3473/// CC
3475 SDValue RHS = {}) {
3476 switch (CC) {
3477 default:
3478 llvm_unreachable("Unknown condition code!");
3479 case ISD::SETNE:
3480 return AArch64CC::NE;
3481 case ISD::SETEQ:
3482 return AArch64CC::EQ;
3483 case ISD::SETGT:
3484 return AArch64CC::GT;
3485 case ISD::SETGE:
3487 case ISD::SETLT:
3489 case ISD::SETLE:
3490 return AArch64CC::LE;
3491 case ISD::SETUGT:
3492 return AArch64CC::HI;
3493 case ISD::SETUGE:
3494 return AArch64CC::HS;
3495 case ISD::SETULT:
3496 return AArch64CC::LO;
3497 case ISD::SETULE:
3498 return AArch64CC::LS;
3499 }
3500}
3501
3502/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3504 AArch64CC::CondCode &CondCode,
3505 AArch64CC::CondCode &CondCode2) {
3506 CondCode2 = AArch64CC::AL;
3507 switch (CC) {
3508 default:
3509 llvm_unreachable("Unknown FP condition!");
3510 case ISD::SETEQ:
3511 case ISD::SETOEQ:
3512 CondCode = AArch64CC::EQ;
3513 break;
3514 case ISD::SETGT:
3515 case ISD::SETOGT:
3516 CondCode = AArch64CC::GT;
3517 break;
3518 case ISD::SETGE:
3519 case ISD::SETOGE:
3520 CondCode = AArch64CC::GE;
3521 break;
3522 case ISD::SETOLT:
3523 CondCode = AArch64CC::MI;
3524 break;
3525 case ISD::SETOLE:
3526 CondCode = AArch64CC::LS;
3527 break;
3528 case ISD::SETONE:
3529 CondCode = AArch64CC::MI;
3530 CondCode2 = AArch64CC::GT;
3531 break;
3532 case ISD::SETO:
3533 CondCode = AArch64CC::VC;
3534 break;
3535 case ISD::SETUO:
3536 CondCode = AArch64CC::VS;
3537 break;
3538 case ISD::SETUEQ:
3539 CondCode = AArch64CC::EQ;
3540 CondCode2 = AArch64CC::VS;
3541 break;
3542 case ISD::SETUGT:
3543 CondCode = AArch64CC::HI;
3544 break;
3545 case ISD::SETUGE:
3546 CondCode = AArch64CC::PL;
3547 break;
3548 case ISD::SETLT:
3549 case ISD::SETULT:
3550 CondCode = AArch64CC::LT;
3551 break;
3552 case ISD::SETLE:
3553 case ISD::SETULE:
3554 CondCode = AArch64CC::LE;
3555 break;
3556 case ISD::SETNE:
3557 case ISD::SETUNE:
3558 CondCode = AArch64CC::NE;
3559 break;
3560 }
3561}
3562
3563/// Convert a DAG fp condition code to an AArch64 CC.
3564/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3565/// should be AND'ed instead of OR'ed.
3567 AArch64CC::CondCode &CondCode,
3568 AArch64CC::CondCode &CondCode2) {
3569 CondCode2 = AArch64CC::AL;
3570 switch (CC) {
3571 default:
3572 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3573 assert(CondCode2 == AArch64CC::AL);
3574 break;
3575 case ISD::SETONE:
3576 // (a one b)
3577 // == ((a olt b) || (a ogt b))
3578 // == ((a ord b) && (a une b))
3579 CondCode = AArch64CC::VC;
3580 CondCode2 = AArch64CC::NE;
3581 break;
3582 case ISD::SETUEQ:
3583 // (a ueq b)
3584 // == ((a uno b) || (a oeq b))
3585 // == ((a ule b) && (a uge b))
3586 CondCode = AArch64CC::PL;
3587 CondCode2 = AArch64CC::LE;
3588 break;
3589 }
3590}
3591
3592/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3593/// CC usable with the vector instructions. Fewer operations are available
3594/// without a real NZCV register, so we have to use less efficient combinations
3595/// to get the same effect.
3597 AArch64CC::CondCode &CondCode,
3598 AArch64CC::CondCode &CondCode2,
3599 bool &Invert) {
3600 Invert = false;
3601 switch (CC) {
3602 default:
3603 // Mostly the scalar mappings work fine.
3604 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3605 break;
3606 case ISD::SETUO:
3607 Invert = true;
3608 [[fallthrough]];
3609 case ISD::SETO:
3610 CondCode = AArch64CC::MI;
3611 CondCode2 = AArch64CC::GE;
3612 break;
3613 case ISD::SETUEQ:
3614 case ISD::SETULT:
3615 case ISD::SETULE:
3616 case ISD::SETUGT:
3617 case ISD::SETUGE:
3618 // All of the compare-mask comparisons are ordered, but we can switch
3619 // between the two by a double inversion. E.g. ULE == !OGT.
3620 Invert = true;
3621 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3622 CondCode, CondCode2);
3623 break;
3624 }
3625}
3626
3627/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
3629 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3630 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3631}
3632
3634 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3635 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3636 LLVM_DEBUG(dbgs() << "Is imm " << C
3637 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3638 return IsLegal;
3639}
3640
3642 // Works for negative immediates too, as it can be written as an ADDS
3643 // instruction with a negated immediate.
3644 return isLegalArithImmed(C.abs().getZExtValue());
3645}
3646
3648 uint64_t Imm = C.getZExtValue();
3650 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3651 return Insn.size();
3652}
3653
3655 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3656 if (Op->getFlags().hasNoSignedWrap())
3657 return true;
3658
3659 // We can still figure out if the second operand is safe to use
3660 // in a CMN instruction by checking if it is known to be not the minimum
3661 // signed value. If it is not, then we can safely use CMN.
3662 // Note: We can eventually remove this check and simply rely on
3663 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3664 // consistently sets them appropriately when making said nodes.
3665
3666 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3667 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3668}
3669
 3670 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3671// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3672// can be set differently by this operation. It comes down to whether
3673// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3674// everything is fine. If not then the optimization is wrong. Thus general
3675// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3676//
3677// So, finally, the only LLVM-native comparisons that don't mention C or V
3678// are the ones that aren't unsigned comparisons. They're the only ones we can
3679// safely use CMN for in the absence of information about op2.
3681 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3682 (isIntEqualitySetCC(CC) ||
3683 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3684 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3685}
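// For example, "cmp w0, (0 - w1)" can become "cmn w0, w1" unconditionally for
// EQ/NE, for unsigned orderings only when w1 is known to be non-zero, and for
// signed orderings only when the negation cannot wrap past INT_MIN, which is
// what the checks above encode.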
3686
3688 SelectionDAG &DAG, SDValue Chain,
3689 bool IsSignaling) {
3690 EVT VT = LHS.getValueType();
3691 assert(VT != MVT::f128);
3692
3693 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3694
3695 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3696 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3697 {Chain, LHS});
3698 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3699 {LHS.getValue(1), RHS});
3700 Chain = RHS.getValue(1);
3701 }
3702 unsigned Opcode =
3703 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3704 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3705}
3706
3708 const SDLoc &DL, SelectionDAG &DAG) {
3709 EVT VT = LHS.getValueType();
3710 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3711
3712 if (VT.isFloatingPoint()) {
3713 assert(VT != MVT::f128);
3714 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3715 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3716 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3717 }
3718 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3719 }
3720
3721 // The CMP instruction is just an alias for SUBS, and representing it as
3722 // SUBS means that it's possible to get CSE with subtract operations.
3723 // A later phase can perform the optimization of setting the destination
3724 // register to WZR/XZR if it ends up being unused.
3725 unsigned Opcode = AArch64ISD::SUBS;
3726
3727 if (isCMN(RHS, CC, DAG)) {
3728 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3729 Opcode = AArch64ISD::ADDS;
3730 RHS = RHS.getOperand(1);
3731 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3732 isIntEqualitySetCC(CC)) {
3733 // As we are looking for EQ/NE compares, the operands can be commuted; can
3734 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3735 Opcode = AArch64ISD::ADDS;
3736 LHS = LHS.getOperand(1);
3737 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3738 if (LHS.getOpcode() == ISD::AND) {
3739 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3740 // (a.k.a. ANDS), except that the flags are only guaranteed to be correct
3741 // for equality and signed comparisons (hence the !isUnsignedIntSetCC check above).
3742 const SDValue ANDSNode =
3743 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3744 LHS.getOperand(0), LHS.getOperand(1));
3745 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3746 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3747 return ANDSNode.getValue(1);
3748 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3749 // Use result of ANDS
3750 return LHS.getValue(1);
3751 }
3752 }
3753
3754 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3755 .getValue(1);
3756}
3757
3758/// \defgroup AArch64CCMP CMP;CCMP matching
3759///
3760/// These functions deal with the formation of CMP;CCMP;... sequences.
3761/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3762/// a comparison. They set the NZCV flags to a predefined value if their
3763/// predicate is false. This makes it possible to express arbitrary
3764/// conjunctions; for example, "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" is
3765/// expressed as:
3766/// cmp A
3767/// ccmp B, inv(CB), CA
3768/// check for CB flags
3769///
3770/// This naturally lets us implement chains of AND operations with SETCC
3771/// operands. And we can even implement some other situations by transforming
3772/// them:
3773/// - We can implement (NEG SETCC), i.e. negating a single comparison, by
3774/// negating the flags used in a CCMP/FCCMP operation.
3775/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3776/// by negating the flags we test for afterwards. i.e.
3777/// NEG (CMP CCMP CCMP ...) can be implemented.
3778/// - Note that we can only ever negate all previously processed results.
3779/// What we cannot implement by flipping the flags to test is a negation
3780/// of two sub-trees (because the negation affects all sub-trees emitted so
3781/// far, so the 2nd sub-tree we emit would also affect the first).
3782/// With those tools we can implement some OR operations:
3783/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3784/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3785/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3786/// elimination rules from earlier to implement the whole thing as a
3787/// CCMP/FCCMP chain.
3788///
3789/// As a complete example:
3790/// or (or (setCA (cmp A)) (setCB (cmp B)))
3791/// (and (setCC (cmp C)) (setCD (cmp D)))
3792/// can be reassociated to:
3793/// or (and (setCC (cmp C)) (setCD (cmp D)))
3794/// (or (setCA (cmp A)) (setCB (cmp B)))
3795/// can be transformed to:
3796/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3797/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3798/// which can be implemented as:
3799/// cmp C
3800/// ccmp D, inv(CD), CC
3801/// ccmp A, CA, inv(CD)
3802/// ccmp B, CB, inv(CA)
3803/// check for CB flags
3804///
3805/// A counterexample is "or (and A B) (and C D)" which translates to
3806/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3807/// can only implement one of the inner (not) operations, but not both!
3808/// @{
3809
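
// A minimal worked example of the chain described above; the C++ function and
// the instruction sequence are illustrative only (registers and the exact
// NZCV immediate depend on the surrounding code).
[[maybe_unused]] static bool ccmpChainSketch(int A, int B) {
  return A == 0 && B > 17;
}
// The intent of the machinery below is to lower such a conjunction to a
// single flag-setting chain along the lines of:
//   cmp  w0, #0            // flags for "A == 0"
//   ccmp w1, #17, #4, eq   // if eq, compare B with 17; otherwise force NZCV
//                          // to a value that fails the final "gt" test
//   cset w0, gt            // materialise the overall result
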
3810/// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate.
3812 ISD::CondCode CC, SDValue CCOp,
3814 AArch64CC::CondCode OutCC,
3815 const SDLoc &DL, SelectionDAG &DAG) {
3816 unsigned Opcode = 0;
3817 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3818
3819 if (LHS.getValueType().isFloatingPoint()) {
3820 assert(LHS.getValueType() != MVT::f128);
3821 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3822 LHS.getValueType() == MVT::bf16) {
3823 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3824 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3825 }
3826 Opcode = AArch64ISD::FCCMP;
3827 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3828 APInt Imm = Const->getAPIntValue();
3829 if (Imm.isNegative() && Imm.sgt(-32)) {
3830 Opcode = AArch64ISD::CCMN;
3831 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3832 }
3833 } else if (isCMN(RHS, CC, DAG)) {
3834 Opcode = AArch64ISD::CCMN;
3835 RHS = RHS.getOperand(1);
3836 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3837 isIntEqualitySetCC(CC)) {
3838 // As we are looking for EQ/NE compares, the operands can be commuted ; can
3839 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
3840 Opcode = AArch64ISD::CCMN;
3841 LHS = LHS.getOperand(1);
3842 }
3843 if (Opcode == 0)
3844 Opcode = AArch64ISD::CCMP;
3845
3846 SDValue Condition = getCondCode(DAG, Predicate);
3848 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3849 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3850 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3851}
3852
3853/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3854/// expressed as a conjunction. See \ref AArch64CCMP.
3855/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3856/// changing the conditions on the SETCC tests.
3857/// (this means we can call emitConjunctionRec() with
3858/// Negate==true on this sub-tree)
3859/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3860/// cannot do the negation naturally. We are required to
3861/// emit the subtree first in this case.
3862/// \param WillNegate Is true if we are called when the result of this
3863/// subexpression must be negated. This happens when the
3864/// outer expression is an OR. We can use this fact to know
3865/// that we have a double negation (or (or ...) ...) that
3866/// can be implemented for free.
3867static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3868 bool &MustBeFirst, bool WillNegate,
3869 unsigned Depth = 0) {
3870 if (!Val.hasOneUse())
3871 return false;
3872 unsigned Opcode = Val->getOpcode();
3873 if (Opcode == ISD::SETCC) {
3874 if (Val->getOperand(0).getValueType() == MVT::f128)
3875 return false;
3876 CanNegate = true;
3877 MustBeFirst = false;
3878 return true;
3879 }
3880 // Protect against exponential runtime and stack overflow.
3881 if (Depth > 6)
3882 return false;
3883 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3884 bool IsOR = Opcode == ISD::OR;
3885 SDValue O0 = Val->getOperand(0);
3886 SDValue O1 = Val->getOperand(1);
3887 bool CanNegateL;
3888 bool MustBeFirstL;
3889 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3890 return false;
3891 bool CanNegateR;
3892 bool MustBeFirstR;
3893 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3894 return false;
3895
3896 if (MustBeFirstL && MustBeFirstR)
3897 return false;
3898
3899 if (IsOR) {
3900 // For an OR expression we need to be able to naturally negate at least
3901 // one side or we cannot do the transformation at all.
3902 if (!CanNegateL && !CanNegateR)
3903 return false;
3904 // If the result of the OR will be negated and we can naturally negate
3905 // the leaves, then this sub-tree as a whole negates naturally.
3906 CanNegate = WillNegate && CanNegateL && CanNegateR;
3907 // If we cannot naturally negate the whole sub-tree, then this must be
3908 // emitted first.
3909 MustBeFirst = !CanNegate;
3910 } else {
3911 assert(Opcode == ISD::AND && "Must be OR or AND");
3912 // We cannot naturally negate an AND operation.
3913 CanNegate = false;
3914 MustBeFirst = MustBeFirstL || MustBeFirstR;
3915 }
3916 return true;
3917 }
3918 return false;
3919}
3920
3921/// Emit a conjunction or disjunction tree with the CMP/FCMP followed by a chain
3922/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3923/// Tries to transform the given i1-producing node @p Val into a series of
3924/// compare and conditional compare operations. @returns an NZCV-flags-producing
3925/// node and sets @p OutCC to the flags that should be tested, or returns
3926/// SDValue() if the transformation was not possible.
3927/// \p Negate is true if we want this sub-tree to be negated just by changing
3928/// SETCC conditions.
3930 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3932 // If we're at a tree leaf (SETCC), produce a comparison or conditional comparison.
3933 unsigned Opcode = Val->getOpcode();
3934 if (Opcode == ISD::SETCC) {
3935 SDValue LHS = Val->getOperand(0);
3936 SDValue RHS = Val->getOperand(1);
3937 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3938 bool isInteger = LHS.getValueType().isInteger();
3939 if (Negate)
3940 CC = getSetCCInverse(CC, LHS.getValueType());
3941 SDLoc DL(Val);
3942 // Determine OutCC and handle FP special case.
3943 if (isInteger) {
3944 OutCC = changeIntCCToAArch64CC(CC, RHS);
3945 } else {
3946 assert(LHS.getValueType().isFloatingPoint());
3947 AArch64CC::CondCode ExtraCC;
3948 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3949 // Some floating point conditions can't be tested with a single condition
3950 // code. Construct an additional comparison in this case.
3951 if (ExtraCC != AArch64CC::AL) {
3952 SDValue ExtraCmp;
3953 if (!CCOp.getNode())
3954 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3955 else
3956 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3957 ExtraCC, DL, DAG);
3958 CCOp = ExtraCmp;
3959 Predicate = ExtraCC;
3960 }
3961 }
3962
3963 // Produce a normal comparison if we are first in the chain
3964 if (!CCOp)
3965 return emitComparison(LHS, RHS, CC, DL, DAG);
3966 // Otherwise produce a ccmp.
3967 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3968 DAG);
3969 }
3970 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3971
3972 bool IsOR = Opcode == ISD::OR;
3973
3974 SDValue LHS = Val->getOperand(0);
3975 bool CanNegateL;
3976 bool MustBeFirstL;
3977 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3978 assert(ValidL && "Valid conjunction/disjunction tree");
3979 (void)ValidL;
3980
3981 SDValue RHS = Val->getOperand(1);
3982 bool CanNegateR;
3983 bool MustBeFirstR;
3984 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3985 assert(ValidR && "Valid conjunction/disjunction tree");
3986 (void)ValidR;
3987
3988 // Swap sub-tree that must come first to the right side.
3989 if (MustBeFirstL) {
3990 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3991 std::swap(LHS, RHS);
3992 std::swap(CanNegateL, CanNegateR);
3993 std::swap(MustBeFirstL, MustBeFirstR);
3994 }
3995
3996 bool NegateR;
3997 bool NegateAfterR;
3998 bool NegateL;
3999 bool NegateAfterAll;
4000 if (Opcode == ISD::OR) {
4001 // Swap the sub-tree that we can negate naturally to the left.
4002 if (!CanNegateL) {
4003 assert(CanNegateR && "at least one side must be negatable");
4004 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4005 assert(!Negate);
4006 std::swap(LHS, RHS);
4007 NegateR = false;
4008 NegateAfterR = true;
4009 } else {
4010 // Negate the left sub-tree if possible, otherwise negate the result.
4011 NegateR = CanNegateR;
4012 NegateAfterR = !CanNegateR;
4013 }
4014 NegateL = true;
4015 NegateAfterAll = !Negate;
4016 } else {
4017 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
4018 assert(!Negate && "Valid conjunction/disjunction tree");
4019
4020 NegateL = false;
4021 NegateR = false;
4022 NegateAfterR = false;
4023 NegateAfterAll = false;
4024 }
4025
4026 // Emit sub-trees.
4027 AArch64CC::CondCode RHSCC;
4028 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
4029 if (NegateAfterR)
4030 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4031 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
4032 if (NegateAfterAll)
4033 OutCC = AArch64CC::getInvertedCondCode(OutCC);
4034 return CmpL;
4035}
4036
4037/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
4038/// In some cases this is even possible with OR operations in the expression.
4039/// See \ref AArch64CCMP.
4040/// \see emitConjunctionRec().
4042 AArch64CC::CondCode &OutCC) {
4043 bool DummyCanNegate;
4044 bool DummyMustBeFirst;
4045 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
4046 return SDValue();
4047
4048 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
4049}
4050
4051/// @}
4052
4053/// Returns how profitable it is to fold a comparison's operand's shift and/or
4054/// extension operations.
4056 auto isSupportedExtend = [&](SDValue V) {
4057 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4058 return true;
4059
4060 if (V.getOpcode() == ISD::AND)
4061 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4062 uint64_t Mask = MaskCst->getZExtValue();
4063 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4064 }
4065
4066 return false;
4067 };
4068
4069 if (!Op.hasOneUse())
4070 return 0;
4071
4072 if (isSupportedExtend(Op))
4073 return 1;
4074
4075 unsigned Opc = Op.getOpcode();
4076 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4077 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4078 uint64_t Shift = ShiftCst->getZExtValue();
4079 if (isSupportedExtend(Op.getOperand(0)))
4080 return (Shift <= 4) ? 2 : 1;
4081 EVT VT = Op.getValueType();
4082 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4083 return 1;
4084 }
4085
4086 return 0;
4087}
4088
4089// Converts a comparison of an AND/ANDS result with one or negative one into a
4090// comparison with zero, so that emitComparison() can use ANDS. Note that this
4091// only works for signed comparisons because of how ANDS works.
4093 // Only works for ANDS and AND.
4094 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4095 return false;
4096
4097 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4098 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4099 return true;
4100 }
4101
4102 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4103 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4104 return true;
4105 }
4106
4107 return false;
4108}
4109
4111 SDValue &AArch64cc, SelectionDAG &DAG,
4112 const SDLoc &DL) {
4113 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4114 EVT VT = RHS.getValueType();
4115 APInt C = RHSC->getAPIntValue();
4116 // shouldBeAdjustedToZero is a special case to better fold with
4117 // emitComparison().
4118 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4119 // Adjust the constant to zero.
4120 // CC has already been adjusted.
4121 RHS = DAG.getConstant(0, DL, VT);
4122 } else if (!isLegalCmpImmed(C)) {
4123 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4124 // The constant does not fit; try adjusting it by one.
4125 switch (CC) {
4126 default:
4127 break;
4128 case ISD::SETLT:
4129 case ISD::SETGE:
4130 if (!C.isMinSignedValue()) {
4131 APInt CMinusOne = C - 1;
4132 if (isLegalCmpImmed(CMinusOne) ||
4133 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4134 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4135 RHS = DAG.getConstant(CMinusOne, DL, VT);
4136 }
4137 }
4138 break;
4139 case ISD::SETULT:
4140 case ISD::SETUGE: {
4141 // C cannot be 0 here, since 0 is a legal immediate and C is not.
4142 assert(!C.isZero() && "C should not be zero here");
4143 APInt CMinusOne = C - 1;
4144 if (isLegalCmpImmed(CMinusOne) ||
4145 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4146 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4147 RHS = DAG.getConstant(CMinusOne, DL, VT);
4148 }
4149 break;
4150 }
4151 case ISD::SETLE:
4152 case ISD::SETGT:
4153 if (!C.isMaxSignedValue()) {
4154 APInt CPlusOne = C + 1;
4155 if (isLegalCmpImmed(CPlusOne) ||
4156 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4157 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4158 RHS = DAG.getConstant(CPlusOne, DL, VT);
4159 }
4160 }
4161 break;
4162 case ISD::SETULE:
4163 case ISD::SETUGT: {
4164 if (!C.isAllOnes()) {
4165 APInt CPlusOne = C + 1;
4166 if (isLegalCmpImmed(CPlusOne) ||
4167 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4168 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4169 RHS = DAG.getConstant(CPlusOne, DL, VT);
4170 }
4171 }
4172 break;
4173 }
4174 }
4175 }
4176 }
4177
4178 // Comparisons are canonicalized so that the RHS operand is simpler than the
4179 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4180 // can fold some shift+extend operations on the RHS operand, so swap the
4181 // operands if that can be done.
4182 //
4183 // For example:
4184 // lsl w13, w11, #1
4185 // cmp w13, w12
4186 // can be turned into:
4187 // cmp w12, w11, lsl #1
4188 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4189 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4190 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4191 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4192 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4193
4194 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4195 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4196 std::swap(LHS, RHS);
4198 }
4199 }
4200
4201 SDValue Cmp;
4203 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4205
4206 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4207 // For the i8 operand, the largest immediate is 255, so this can be easily
4208 // encoded in the compare instruction. For the i16 operand, however, the
4209 // largest immediate cannot be encoded in the compare.
4210 // Therefore, use a sign extending load and cmn to avoid materializing the
4211 // -1 constant. For example,
4212 // movz w1, #65535
4213 // ldrh w0, [x0, #0]
4214 // cmp w0, w1
4215 // >
4216 // ldrsh w0, [x0, #0]
4217 // cmn w0, #1
4218 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4219 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4220 // ensure both the LHS and RHS are truly zero extended and to make sure the
4221 // transformation is profitable.
4222 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4223 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4224 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4225 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4226 int16_t ValueofRHS = RHS->getAsZExtVal();
4227 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4228 SDValue SExt =
4229 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4230 DAG.getValueType(MVT::i16));
4231 Cmp = emitComparison(
4232 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4233 DL, DAG);
4235 }
4236 }
4237
4238 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4239 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4240 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4242 }
4243 }
4244 }
4245
4246 if (!Cmp) {
4247 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4249 }
4250 AArch64cc = getCondCode(DAG, AArch64CC);
4251 return Cmp;
4252}
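
// A small sketch of the constant adjustment performed above (illustration
// only): for any signed C other than INT64_MIN, "x < C" is equivalent to
// "x <= C - 1", so an immediate such as 0x20001 that no single CMP can
// encode may be replaced by 0x20000, which encodes as "#0x20, LSL #12".
namespace {
constexpr bool sltSketch(int64_t X, int64_t C) { return X < C; }
constexpr bool sleSketch(int64_t X, int64_t C) { return X <= C; }
static_assert(sltSketch(0x20000, 0x20001) == sleSketch(0x20000, 0x20000),
              "just below the boundary");
static_assert(sltSketch(0x20001, 0x20001) == sleSketch(0x20001, 0x20000),
              "at the boundary");
} // namespace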
4253
4254static std::pair<SDValue, SDValue>
4256 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4257 "Unsupported value type");
4258 SDValue Value, Overflow;
4259 SDLoc DL(Op);
4260 SDValue LHS = Op.getOperand(0);
4261 SDValue RHS = Op.getOperand(1);
4262 unsigned Opc = 0;
4263 switch (Op.getOpcode()) {
4264 default:
4265 llvm_unreachable("Unknown overflow instruction!");
4266 case ISD::SADDO:
4267 Opc = AArch64ISD::ADDS;
4268 CC = AArch64CC::VS;
4269 break;
4270 case ISD::UADDO:
4271 Opc = AArch64ISD::ADDS;
4272 CC = AArch64CC::HS;
4273 break;
4274 case ISD::SSUBO:
4275 Opc = AArch64ISD::SUBS;
4276 CC = AArch64CC::VS;
4277 break;
4278 case ISD::USUBO:
4279 Opc = AArch64ISD::SUBS;
4280 CC = AArch64CC::LO;
4281 break;
4282 // Multiply needs a little bit of extra work.
4283 case ISD::SMULO:
4284 case ISD::UMULO: {
4285 CC = AArch64CC::NE;
4286 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4287 if (Op.getValueType() == MVT::i32) {
4288 // Extend to 64-bits, then perform a 64-bit multiply.
4289 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4290 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4291 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4292 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4293 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4294
4295 // Check that the result fits into a 32-bit integer.
4296 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4297 if (IsSigned) {
4298 // cmp xreg, wreg, sxtw
4299 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4300 Overflow =
4301 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4302 } else {
4303 // tst xreg, #0xffffffff00000000
4304 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4305 Overflow =
4306 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4307 }
4308 break;
4309 }
4310 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4311 // For the 64-bit multiply, detect overflow from the high half of the product.
4312 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4313 if (IsSigned) {
4314 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4315 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4316 DAG.getConstant(63, DL, MVT::i64));
4317 // It is important that LowerBits is last, otherwise the arithmetic
4318 // shift will not be folded into the compare (SUBS).
4319 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4320 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4321 .getValue(1);
4322 } else {
4323 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4324 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4325 Overflow =
4326 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4327 DAG.getConstant(0, DL, MVT::i64),
4328 UpperBits).getValue(1);
4329 }
4330 break;
4331 }
4332 } // switch (...)
4333
4334 if (Opc) {
4335 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4336
4337 // Emit the AArch64 operation with overflow check.
4338 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4339 Overflow = Value.getValue(1);
4340 }
4341 return std::make_pair(Value, Overflow);
4342}
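
// Sketch of the i32 UMULO strategy above (illustration only; the helper name
// is made up): widen both operands to 64 bits, multiply once, and report
// overflow when any of the high 32 bits of the product are set.
[[maybe_unused]] static bool umulo32Sketch(uint32_t A, uint32_t B,
                                           uint32_t &Result) {
  uint64_t Wide = uint64_t(A) * uint64_t(B); // single 64-bit MUL
  Result = uint32_t(Wide);                   // truncate back to 32 bits
  return (Wide >> 32) != 0;                  // "tst x, #0xffffffff00000000"
}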
4343
4344SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4345 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4346 !Subtarget->isNeonAvailable()))
4347 return LowerToScalableOp(Op, DAG);
4348
4349 SDValue Sel = Op.getOperand(0);
4350 SDValue Other = Op.getOperand(1);
4351 SDLoc DL(Sel);
4352
4353 // If the operand is an overflow checking operation, invert the condition
4354 // code and kill the Not operation. I.e., transform:
4355 // (xor overflow_op_bool, 1)
4356 // -->
4357 // (csel 1, 0, invert(cc), overflow_op_bool)
4358 // ... which later gets transformed to just a cset instruction with an
4359 // inverted condition code, rather than a cset + eor sequence.
4361 // Only lower legal XALUO ops.
4363 return SDValue();
4364
4365 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4366 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4368 SDValue Value, Overflow;
4369 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4370 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4371 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4372 CCVal, Overflow);
4373 }
4374 // If neither operand is a SELECT_CC, give up.
4375 if (Sel.getOpcode() != ISD::SELECT_CC)
4376 std::swap(Sel, Other);
4377 if (Sel.getOpcode() != ISD::SELECT_CC)
4378 return Op;
4379
4380 // The folding we want to perform is:
4381 // (xor x, (select_cc a, b, cc, 0, -1) )
4382 // -->
4383 // (csel x, (xor x, -1), cc ...)
4384 //
4385 // The latter will get matched to a CSINV instruction.
4386
4387 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4388 SDValue LHS = Sel.getOperand(0);
4389 SDValue RHS = Sel.getOperand(1);
4390 SDValue TVal = Sel.getOperand(2);
4391 SDValue FVal = Sel.getOperand(3);
4392
4393 // FIXME: This could be generalized to non-integer comparisons.
4394 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4395 return Op;
4396
4397 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4398 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4399
4400 // The values aren't constants, so this isn't the pattern we're looking for.
4401 if (!CFVal || !CTVal)
4402 return Op;
4403
4404 // We can commute the SELECT_CC by inverting the condition. This
4405 // might be needed to make this fit into a CSINV pattern.
4406 if (CTVal->isAllOnes() && CFVal->isZero()) {
4407 std::swap(TVal, FVal);
4408 std::swap(CTVal, CFVal);
4409 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4410 }
4411
4412 // If the constants line up, perform the transform!
4413 if (CTVal->isZero() && CFVal->isAllOnes()) {
4414 SDValue CCVal;
4415 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4416
4417 FVal = Other;
4418 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4419 DAG.getAllOnesConstant(DL, Other.getValueType()));
4420
4421 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4422 CCVal, Cmp);
4423 }
4424
4425 return Op;
4426}
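
// A minimal sketch of the select_cc fold above (illustration only): when the
// selected values are 0 and all-ones, XOR-ing with the select either leaves x
// unchanged or inverts it, which is exactly what CSINV computes once the
// compare has produced flags.
[[maybe_unused]] static bool csinvFoldSketch(int64_t X, bool Cond) {
  int64_t Mask = Cond ? 0 : -1;         // the (select_cc ..., 0, -1) operand
  return (X ^ Mask) == (Cond ? X : ~X); // always true; selectable as CSINV
}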
4427
4428// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4429// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4430// sets 'C' bit to 0.
4432 SDLoc DL(Value);
4433 EVT VT = Value.getValueType();
4434 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4435 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4436 SDValue Cmp =
4437 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4438 return Cmp.getValue(1);
4439}
4440
4441// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4442// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4444 bool Invert) {
4445 assert(Glue.getResNo() == 1);
4446 SDLoc DL(Glue);
4447 SDValue Zero = DAG.getConstant(0, DL, VT);
4448 SDValue One = DAG.getConstant(1, DL, VT);
4450 SDValue CC = getCondCode(DAG, Cond);
4451 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4452}
4453
4454// Value is 1 if 'V' bit of NZCV is 1, else 0
4456 assert(Glue.getResNo() == 1);
4457 SDLoc DL(Glue);
4458 SDValue Zero = DAG.getConstant(0, DL, VT);
4459 SDValue One = DAG.getConstant(1, DL, VT);
4461 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4462}
4463
4464// This lowering is inefficient, but it will get cleaned up by
4465// `foldOverflowCheck`
4467 unsigned Opcode, bool IsSigned) {
4468 EVT VT0 = Op.getValue(0).getValueType();
4469 EVT VT1 = Op.getValue(1).getValueType();
4470
4471 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4472 return SDValue();
4473
4474 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4475 SDValue OpLHS = Op.getOperand(0);
4476 SDValue OpRHS = Op.getOperand(1);
4477 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4478
4479 SDLoc DL(Op);
4480
4481 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4482 OpRHS, OpCarryIn);
4483
4484 SDValue OutFlag =
4485 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4486 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4487
4488 return DAG.getMergeValues({Sum, OutFlag}, DL);
4489}
4490
4492 // Let legalize expand this if it isn't a legal type yet.
4493 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4494 return SDValue();
4495
4496 SDLoc DL(Op);
4498 // The actual operation that sets the overflow or carry flag.
4499 SDValue Value, Overflow;
4500 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4501
4502 // We use 0 and 1 as false and true values.
4503 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4504 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4505
4506 // We use an inverted condition, because the conditional select is inverted
4507 // too. This will allow it to be selected to a single instruction:
4508 // CSINC Wd, WZR, WZR, invert(cond).
4509 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4510 Overflow =
4511 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4512
4513 return DAG.getMergeValues({Value, Overflow}, DL);
4514}
4515
4516// Prefetch operands are:
4517// 1: Address to prefetch
4518// 2: bool isWrite
4519// 3: int locality (0 = no locality ... 3 = extreme locality)
4520// 4: bool isDataCache
4522 SDLoc DL(Op);
4523 unsigned IsWrite = Op.getConstantOperandVal(2);
4524 unsigned Locality = Op.getConstantOperandVal(3);
4525 unsigned IsData = Op.getConstantOperandVal(4);
4526
4527 bool IsStream = !Locality;
4528 // When the locality number is set
4529 if (Locality) {
4530 // The front-end should have filtered out the out-of-range values
4531 assert(Locality <= 3 && "Prefetch locality out-of-range");
4532 // The locality degree runs opposite to the cache level: higher locality
4533 // means a closer (lower-numbered) cache. Flip the number, since the
4534 // encoding starts at 0 for level 1.
4535 Locality = 3 - Locality;
4536 }
4537
4538 // Build the mask value encoding the expected behavior.
4539 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4540 (!IsData << 3) | // IsDataCache bit
4541 (Locality << 1) | // Cache level bits
4542 (unsigned)IsStream; // Stream bit
4543 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4544 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4545 Op.getOperand(1));
4546}
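
// A small sketch of the PRFM operand encoding assembled above (illustration
// only; the helper name is made up): the read/write bit, the
// instruction-vs-data bit, the target cache level and the streaming bit are
// packed into one 5-bit immediate.
[[maybe_unused]] static unsigned prfOpSketch(bool IsWrite, unsigned Locality,
                                             bool IsData) {
  bool IsStream = Locality == 0;                // no locality -> non-temporal
  unsigned Level = Locality ? 3 - Locality : 0; // locality 3 -> L1 (encoded 0)
  return (IsWrite << 4) | (!IsData << 3) | (Level << 1) | unsigned(IsStream);
}
// For example, prfOpSketch(false, 3, true) == 0b00000, i.e. PLDL1KEEP.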
4547
4548// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
4549// is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
4550// SUBS (AND X Y) Z, which produces a better result with emitComparison.
4552 SelectionDAG &DAG, const SDLoc DL) {
4553 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4554 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4556 if (LHSConstOp && RHSConst) {
4557 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4558 uint64_t RHSConstant = RHSConst->getZExtValue();
4559 if (isPowerOf2_64(RHSConstant)) {
4560 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4561 LHS =
4562 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4563 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4564 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4565 CC = ISD::SETEQ;
4566 }
4567 }
4568 }
4569}
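
// Sketch of the rewrite above (illustration only): when Z is a power of two,
// "(X & Y) u< Z" holds exactly when X & Y has no bit set at or above
// log2(Z), i.e. "(X & (Y & ~(Z - 1))) == 0", which a single ANDS/TST can test.
namespace {
constexpr bool ultFormSketch(uint64_t X, uint64_t Y, uint64_t Z) {
  return (X & Y) < Z;
}
constexpr bool eqFormSketch(uint64_t X, uint64_t Y, uint64_t Z) {
  return (X & (Y & ~(Z - 1))) == 0;
}
static_assert(ultFormSketch(0x137, 0xF0, 0x40) ==
                  eqFormSketch(0x137, 0xF0, 0x40),
              "only low mask bits set");
static_assert(ultFormSketch(0x171, 0xF0, 0x40) ==
                  eqFormSketch(0x171, 0xF0, 0x40),
              "a high mask bit set");
} // namespace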
4570
4571SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4572 SelectionDAG &DAG) const {
4573 EVT VT = Op.getValueType();
4574 if (VT.isScalableVector()) {
4575 SDValue SrcVal = Op.getOperand(0);
4576
4577 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4578 // Break conversion in two with the first part converting to f32 and the
4579 // second using native f32->VT instructions.
4580 SDLoc DL(Op);
4581 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4582 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4583 }
4584
4585 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4586 }
4587
4588 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4589 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4590
4591 bool IsStrict = Op->isStrictFPOpcode();
4592 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4593 EVT Op0VT = Op0.getValueType();
4594 if (VT == MVT::f64) {
4595 // f32->f64 and f16->f64 extends are legal.
4596 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4597 return Op;
4598 // Split bf16->f64 extends into two fpextends.
4599 if (Op0VT == MVT::bf16 && IsStrict) {
4600 SDValue Ext1 =
4601 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4602 {Op0, Op.getOperand(0)});
4603 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4604 {Ext1, Ext1.getValue(1)});
4605 }
4606 if (Op0VT == MVT::bf16)
4607 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4608 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4609 return SDValue();
4610 }
4611
4612 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4613 return SDValue();
4614}
4615
4616SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4617 SelectionDAG &DAG) const {
4618 EVT VT = Op.getValueType();
4619 bool IsStrict = Op->isStrictFPOpcode();
4620 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4621 EVT SrcVT = SrcVal.getValueType();
4622 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4623
4624 if (VT.isScalableVector()) {
4625 // Let common code split the operation.
4626 if (SrcVT == MVT::nxv8f32)
4627 return Op;
4628
4629 if (VT.getScalarType() != MVT::bf16)
4630 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4631
4632 SDLoc DL(Op);
4633 constexpr EVT I32 = MVT::nxv4i32;
4634 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4635
4636 SDValue NaN;
4637 SDValue Narrow;
4638
4639 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4640 if (Subtarget->hasBF16())
4641 return LowerToPredicatedOp(Op, DAG,
4642 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4643
4644 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4645
4646 // Set the quiet bit.
4647 if (!DAG.isKnownNeverSNaN(SrcVal))
4648 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4649 } else if (SrcVT == MVT::nxv2f64 &&
4650 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4651 // Round to float without introducing rounding errors and try again.
4652 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4653 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4654 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4655
4657 if (IsStrict)
4658 NewOps.push_back(Op.getOperand(0));
4659 NewOps.push_back(Narrow);
4660 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4661 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4662 } else
4663 return SDValue();
4664
4665 if (!Trunc) {
4666 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4667 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4668 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4669 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4670 }
4671
4672 // Don't round if we had a NaN; rounding would turn 0x7fffffff into
4673 // 0x80000000.
4674 if (NaN) {
4675 EVT I1 = I32.changeElementType(MVT::i1);
4676 EVT CondVT = VT.changeElementType(MVT::i1);
4677 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4678 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4679 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4680 }
4681
4682 // Now that we have rounded, shift the bits into position.
4683 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4684 return getSVESafeBitCast(VT, Narrow, DAG);
4685 }
4686
4687 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4688 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4689
4690 // Expand cases where the result type is BF16 but we don't have hardware
4691 // instructions to lower it.
4692 if (VT.getScalarType() == MVT::bf16 &&
4693 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4694 Subtarget->hasBF16())) {
4695 SDLoc DL(Op);
4696 SDValue Narrow = SrcVal;
4697 SDValue NaN;
4698 EVT I32 = SrcVT.changeElementType(MVT::i32);
4699 EVT F32 = SrcVT.changeElementType(MVT::f32);
4700 if (SrcVT.getScalarType() == MVT::f32) {
4701 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4702 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4703 if (!NeverSNaN) {
4704 // Set the quiet bit.
4705 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4706 DAG.getConstant(0x400000, DL, I32));
4707 }
4708 } else if (SrcVT.getScalarType() == MVT::f64) {
4709 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4710 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4711 } else {
4712 return SDValue();
4713 }
4714 if (!Trunc) {
4715 SDValue One = DAG.getConstant(1, DL, I32);
4716 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4717 DAG.getShiftAmountConstant(16, I32, DL));
4718 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4719 SDValue RoundingBias =
4720 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4721 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4722 }
4723
4724 // Don't round if we had a NaN; rounding would turn 0x7fffffff into
4725 // 0x80000000.
4726 if (NaN) {
4727 SDValue IsNaN = DAG.getSetCC(
4728 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4729 SrcVal, SrcVal, ISD::SETUO);
4730 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4731 }
4732
4733 // Now that we have rounded, shift the bits into position.
4734 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4735 DAG.getShiftAmountConstant(16, I32, DL));
4736 if (VT.isVector()) {
4737 EVT I16 = I32.changeVectorElementType(MVT::i16);
4738 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4739 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4740 }
4741 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4742 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4743 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4744 : Result;
4745 }
4746
4747 if (SrcVT != MVT::f128) {
4748 // Expand cases where the input is a vector bigger than NEON.
4750 return SDValue();
4751
4752 // It's legal except when f128 is involved
4753 return Op;
4754 }
4755
4756 return SDValue();
4757}
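
// A scalar sketch of the f32 -> bf16 sequence constructed above for the
// no-hardware-BF16 case (illustration only; the helper name is made up): add
// a bias of 0x7fff plus the lowest kept bit to round to nearest even, quieten
// NaNs instead of rounding them, then keep the high 16 bits.
[[maybe_unused]] static uint16_t fp32BitsToBF16Sketch(uint32_t Bits) {
  if ((Bits & 0x7fffffff) > 0x7f800000)       // NaN: set the quiet bit,
    return uint16_t((Bits | 0x400000) >> 16); // do not round
  uint32_t Rounded = Bits + 0x7fff + ((Bits >> 16) & 1); // nearest-even bias
  return uint16_t(Rounded >> 16);
}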
4758
4759SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4760 SelectionDAG &DAG) const {
4761 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4762 // Any additional optimization in this function should be recorded
4763 // in the cost tables.
4764 bool IsStrict = Op->isStrictFPOpcode();
4765 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4766 EVT VT = Op.getValueType();
4767
4768 assert(!(IsStrict && VT.isScalableVector()) &&
4769 "Unimplemented SVE support for STRICT_FP_to_INT!");
4770
4771 // f16 conversions are promoted to f32 when full fp16 is not supported.
4772 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4773 InVT.getVectorElementType() == MVT::bf16) {
4774 EVT NewVT = VT.changeElementType(MVT::f32);
4775 SDLoc DL(Op);
4776 if (IsStrict) {
4777 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4778 {Op.getOperand(0), Op.getOperand(1)});
4779 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4780 {Ext.getValue(1), Ext.getValue(0)});
4781 }
4782 return DAG.getNode(
4783 Op.getOpcode(), DL, Op.getValueType(),
4784 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4785 }
4786
4787 if (VT.isScalableVector()) {
4788 if (VT.getVectorElementType() == MVT::i1) {
4789 SDLoc DL(Op);
4790 EVT CvtVT = getPromotedVTForPredicate(VT);
4791 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4792 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4793 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4794 }
4795
4796 // Let common code split the operation.
4797 if (InVT == MVT::nxv8f32)
4798 return Op;
4799
4800 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4801 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4802 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4803 return LowerToPredicatedOp(Op, DAG, Opcode);
4804 }
4805
4806 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4807 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4808 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4809
4810 uint64_t VTSize = VT.getFixedSizeInBits();
4811 uint64_t InVTSize = InVT.getFixedSizeInBits();
4812 if (VTSize < InVTSize) {
4813 SDLoc DL(Op);
4814 if (IsStrict) {
4816 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4817 {Op.getOperand(0), Op.getOperand(1)});
4818 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4819 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4820 }
4821 SDValue Cv =
4822 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4823 Op.getOperand(0));
4824 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4825 }
4826
4827 if (VTSize > InVTSize) {
4828 SDLoc DL(Op);
4829 MVT ExtVT =
4832 if (IsStrict) {
4833 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4834 {Op.getOperand(0), Op.getOperand(1)});
4835 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4836 {Ext.getValue(1), Ext.getValue(0)});
4837 }
4838 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4839 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4840 }
4841
4842 // Use a scalar operation for conversions between single-element vectors of
4843 // the same size.
4844 if (InVT.getVectorNumElements() == 1) {
4845 SDLoc DL(Op);
4846 SDValue Extract = DAG.getNode(
4848 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
4849 EVT ScalarVT = VT.getScalarType();
4850 if (IsStrict)
4851 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
4852 {Op.getOperand(0), Extract});
4853 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
4854 }
4855
4856 // Type changing conversions are illegal.
4857 return Op;
4858}
4859
4860SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4861 SelectionDAG &DAG) const {
4862 bool IsStrict = Op->isStrictFPOpcode();
4863 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4864
4865 if (SrcVal.getValueType().isVector())
4866 return LowerVectorFP_TO_INT(Op, DAG);
4867
4868 // f16 conversions are promoted to f32 when full fp16 is not supported.
4869 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4870 SrcVal.getValueType() == MVT::bf16) {
4871 SDLoc DL(Op);
4872 if (IsStrict) {
4873 SDValue Ext =
4874 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
4875 {Op.getOperand(0), SrcVal});
4876 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
4877 {Ext.getValue(1), Ext.getValue(0)});
4878 }
4879 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
4880 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
4881 }
4882
4883 if (SrcVal.getValueType() != MVT::f128) {
4884 // It's legal except when f128 is involved
4885 return Op;
4886 }
4887
4888 return SDValue();
4889}
4890
4891SDValue
4892AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4893 SelectionDAG &DAG) const {
4894 // AArch64 FP-to-int conversions saturate to the destination element size, so
4895 // we can lower common saturating conversions to simple instructions.
4896 SDValue SrcVal = Op.getOperand(0);
4897 EVT SrcVT = SrcVal.getValueType();
4898 EVT DstVT = Op.getValueType();
4899 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4900
4901 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4902 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4903 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4904 assert(SatWidth <= DstElementWidth &&
4905 "Saturation width cannot exceed result width");
4906
4907 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4908 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4909 // types, so this is hard to reach.
4910 if (DstVT.isScalableVector())
4911 return SDValue();
4912
4913 EVT SrcElementVT = SrcVT.getVectorElementType();
4914
4915 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4916 SDLoc DL(Op);
4917 SDValue SrcVal2;
4918 if ((SrcElementVT == MVT::f16 &&
4919 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4920 SrcElementVT == MVT::bf16) {
4921 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4922 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4923 // If we are extending to a v8f32, split into two v4f32 to produce legal
4924 // types.
4925 if (F32VT.getSizeInBits() > 128) {
4926 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4927 F32VT = F32VT.getHalfNumVectorElementsVT();
4928 }
4929 SrcVT = F32VT;
4930 SrcElementVT = MVT::f32;
4931 SrcElementWidth = 32;
4932 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4933 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4934 return SDValue();
4935
4936 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4937 // width and produce a fcvtzu.
4938 if (SatWidth == 64 && SrcElementWidth < 64) {
4939 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4940 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4941 SrcVT = F64VT;
4942 SrcElementVT = MVT::f64;
4943 SrcElementWidth = 64;
4944 }
4945 // Cases that we can emit directly.
4946 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4947 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4948 DAG.getValueType(DstVT.getScalarType()));
4949 if (SrcVal2) {
4950 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4951 DAG.getValueType(DstVT.getScalarType()));
4952 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4953 }
4954 return Res;
4955 }
4956
4957 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4958 // result. This is only valid if the legal cvt is larger than the saturate
4959 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4960 // (at least until sqxtn is selected).
4961 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4962 return SDValue();
4963
4964 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4965 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4966 DAG.getValueType(IntVT.getScalarType()));
4967 SDValue NativeCvt2 =
4968 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4969 DAG.getValueType(IntVT.getScalarType()))
4970 : SDValue();
4971 SDValue Sat, Sat2;
4972 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4973 SDValue MinC = DAG.getConstant(
4974 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4975 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4976    SDValue Min2 = SrcVal2
                           ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC)
                           : SDValue();
4977 SDValue MaxC = DAG.getConstant(
4978 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4979 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4980 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4981 } else {
4982 SDValue MinC = DAG.getConstant(
4983 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4984 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4985    Sat2 = SrcVal2
               ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC)
               : SDValue();
4986 }
4987
4988 if (SrcVal2)
4989 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4991 Sat, Sat2);
4992
4993 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4994}
4995
4996SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4997 SelectionDAG &DAG) const {
4998 // AArch64 FP-to-int conversions saturate to the destination register size, so
4999 // we can lower common saturating conversions to simple instructions.
5000 SDValue SrcVal = Op.getOperand(0);
5001 EVT SrcVT = SrcVal.getValueType();
5002
5003 if (SrcVT.isVector())
5004 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5005
5006 EVT DstVT = Op.getValueType();
5007 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5008 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5009 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5010 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5011
5012 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5013 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5014 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5015 SrcVT = MVT::f32;
5016 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5017 SrcVT != MVT::bf16)
5018 return SDValue();
5019
5020 SDLoc DL(Op);
5021 // Cases that we can emit directly.
5022 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5023 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5024 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5025 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5026 DAG.getValueType(DstVT));
5027
5028 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5029 // result. This is only valid if the legal cvt is larger than the saturate
5030 // width.
5031 if (DstWidth < SatWidth)
5032 return SDValue();
5033
5034 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5035 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5036 SDValue CVTf32 =
5037 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5038 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5039 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5040 DAG.getValueType(SatVT));
5041 }
5042 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5043 return DAG.getBitcast(DstVT, CVTf32);
5044 }
5045
5046 SDValue NativeCvt =
5047 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5048 SDValue Sat;
5049 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5050 SDValue MinC = DAG.getConstant(
5051 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5052 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5053 SDValue MaxC = DAG.getConstant(
5054 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5055 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5056 } else {
5057 SDValue MinC = DAG.getConstant(
5058 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5059 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5060 }
5061
5062 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5063}
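
// Sketch of the final clamp above for the signed case (illustration only;
// assumes SatWidth is smaller than the 64-bit destination): once the native
// conversion has saturated to the wider destination type, the value is
// clamped to the requested saturation width with a min and a max.
[[maybe_unused]] static int64_t satClampSketch(int64_t NativeCvt,
                                               unsigned SatWidth) {
  int64_t Max = (int64_t(1) << (SatWidth - 1)) - 1; // e.g. 127 for SatWidth 8
  int64_t Min = -Max - 1;                           // e.g. -128
  return std::max(Min, std::min(NativeCvt, Max));   // SMIN then SMAX
}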
5064
5065SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
5066 SelectionDAG &DAG) const {
5067 EVT VT = Op.getValueType();
5068 SDValue Src = Op.getOperand(0);
5069 SDLoc DL(Op);
5070
5071 assert(VT.isVector() && "Expected vector type");
5072
5073 EVT CastVT =
5074 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
5075
5076 // Round the floating-point value into a floating-point register with the
5077 // current rounding mode.
5078 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
5079
5080 // Truncate the rounded floating point to an integer.
5081 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5083}
5084
5085SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5086 SelectionDAG &DAG) const {
5087 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5088 // Any additional optimization in this function should be recorded
5089 // in the cost tables.
5090 bool IsStrict = Op->isStrictFPOpcode();
5091 EVT VT = Op.getValueType();
5092 SDLoc DL(Op);
5093 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5094 EVT InVT = In.getValueType();
5095 unsigned Opc = Op.getOpcode();
5096 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5097
5098 assert(!(IsStrict && VT.isScalableVector()) &&
5099 "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
5100
5101 // NOTE: i1->bf16 does not require promotion to f32.
5102 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5103 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5104 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5105 : DAG.getConstantFP(1.0, DL, VT);
5106 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5107 }
5108
5109 // Promote bf16 conversions to f32.
5110 if (VT.getVectorElementType() == MVT::bf16) {
5111 EVT F32 = VT.changeElementType(MVT::f32);
5112 if (IsStrict) {
5113 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5114 {Op.getOperand(0), In});
5115 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5116 {Op.getValueType(), MVT::Other},
5117 {Val.getValue(1), Val.getValue(0),
5118 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5119 }
5120 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5121 DAG.getNode(Op.getOpcode(), DL, F32, In),
5122 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5123 }
5124
5125 if (VT.isScalableVector()) {
5126 // Let common code split the operation.
5127 if (VT == MVT::nxv8f32)
5128 return Op;
5129
5130 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5131 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5132 return LowerToPredicatedOp(Op, DAG, Opcode);
5133 }
5134
5135 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5136 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5137 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5138
5139 uint64_t VTSize = VT.getFixedSizeInBits();
5140 uint64_t InVTSize = InVT.getFixedSizeInBits();
5141 if (VTSize < InVTSize) {
5142 // AArch64 doesn't have a direct vector instruction to convert
5143 // fixed point to floating point AND narrow it at the same time.
5144 // Additional rounding when the target is f32/f64 causes double
5145 // rounding issues. Conversion to f16 is fine due to narrow width.
5146 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5147 bool IsTargetf16 = false;
5148 if (Op.hasOneUse() &&
5149 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5150 // Some vector types are split in half during legalization, followed by
5151 // concatenation, followed by rounding to the original vector type. If we
5152 // end up resolving to an f16 type, we shouldn't worry about rounding errors.
5153 SDNode *U = *Op->user_begin();
5154 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5155 EVT TmpVT = U->user_begin()->getValueType(0);
5156 if (TmpVT.getScalarType() == MVT::f16)
5157 IsTargetf16 = true;
5158 }
5159 }
5160
5161 if (IsTargetf32 && !IsTargetf16) {
5162 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5163 }
5164
5165 MVT CastVT =
5167 InVT.getVectorNumElements());
5168 if (IsStrict) {
5169 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5170 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5171 {In.getValue(1), In.getValue(0),
5172 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5173 }
5174 In = DAG.getNode(Opc, DL, CastVT, In);
5175 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5176 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5177 }
5178
5179 if (VTSize > InVTSize) {
5180 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5181 EVT CastVT = VT.changeVectorElementTypeToInteger();
5182 In = DAG.getNode(CastOpc, DL, CastVT, In);
5183 if (IsStrict)
5184 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5185 return DAG.getNode(Opc, DL, VT, In);
5186 }
5187
5188 // Use a scalar operation for conversions between single-element vectors of
5189 // the same size.
5190 if (VT.getVectorNumElements() == 1) {
5191 SDValue Extract =
5193 DAG.getConstant(0, DL, MVT::i64));
5194 EVT ScalarVT = VT.getScalarType();
5195 if (IsStrict)
5196 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5197 {Op.getOperand(0), Extract});
5198 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5199 }
5200
5201 return Op;
5202}
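
// A scalar sketch of the double-rounding hazard that the UnrollVectorOp path
// above avoids (illustration only): once an integer needs more than f64's 53
// significand bits, rounding through f64 first can change the final f32
// result, so i64 -> f32 has to be converted in a single step.
[[maybe_unused]] static bool doubleRoundingSketch() {
  uint64_t N = (1ULL << 53) + (1ULL << 29) + 1; // needs 54 significant bits
  float Direct = static_cast<float>(N);         // one rounding: 2^53 + 2^30
  float ViaDouble = static_cast<float>(static_cast<double>(N)); // gives 2^53
  return Direct != ViaDouble;                   // true under round-to-nearest
}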
5203
5204SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5205 SelectionDAG &DAG) const {
5206 if (Op.getValueType().isVector())
5207 return LowerVectorINT_TO_FP(Op, DAG);
5208
5209 bool IsStrict = Op->isStrictFPOpcode();
5210 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5211
5212 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5213 Op->getOpcode() == ISD::SINT_TO_FP;
5214
5215 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5216 SDLoc DL(Op);
5217 if (IsStrict) {
5218 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5219 {Op.getOperand(0), SrcVal});
5220 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5221 {Op.getValueType(), MVT::Other},
5222 {Val.getValue(1), Val.getValue(0),
5223 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5224 }
5225 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5226 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5227 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5228 };
5229
5230 if (Op.getValueType() == MVT::bf16) {
5231 unsigned MaxWidth = IsSigned
5232 ? DAG.ComputeMaxSignificantBits(SrcVal)
5233 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5234 // bf16 conversions are promoted to f32 when converting from i16.
5235 if (MaxWidth <= 24) {
5236 return IntToFpViaPromotion(MVT::f32);
5237 }
5238
5239 // bf16 conversions are promoted to f64 when converting from i32.
5240 if (MaxWidth <= 53) {
5241 return IntToFpViaPromotion(MVT::f64);
5242 }
5243
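// Editorial sketch, not part of the original source: the 24/53 thresholds
// above are the significand widths of f32 and f64, so an integer with at most
// that many significant bits converts exactly and only the final rounding to
// bf16 takes place. A standalone check of that assumption:
static_assert(std::numeric_limits<float>::digits == 24,
              "f32 carries a 24-bit significand");
static_assert(std::numeric_limits<double>::digits == 53,
              "f64 carries a 53-bit significand");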
5244 // We need to be careful about i64 -> bf16.
5245 // Consider an i32 22216703.
5246 // This number cannot be represented exactly as an f32, so an itofp will
5247 // turn it into 22216704.0; an fptrunc to bf16 then turns that into 22282240.0.
5248 // However, the correct bf16 result was supposed to be 22151168.0.
5249 // We need to use sticky rounding to get this correct.
5250 if (SrcVal.getValueType() == MVT::i64) {
5251 SDLoc DL(Op);
5252 // This algorithm is equivalent to the following:
5253 // uint64_t SrcHi = SrcVal & ~0xfffull;
5254 // uint64_t SrcLo = SrcVal & 0xfffull;
5255 // uint64_t Highest = SrcVal >> 53;
5256 // bool HasHighest = Highest != 0;
5257 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5258 // double Rounded = static_cast<double>(ToRound);
5259 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5260 // uint64_t HasLo = SrcLo != 0;
5261 // bool NeedsAdjustment = HasHighest & HasLo;
5262 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5263 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5264 // return static_cast<__bf16>(Adjusted);
5265 //
5266 // Essentially, what happens is that SrcVal either fits perfectly in a
5267 // double-precision value or it is too big. If it is sufficiently small,
5268 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5269 // ensure that u64 -> double has no rounding error by only using the 52
5270 // MSB of the input. The low order bits will get merged into a sticky bit
5271 // which will avoid issues incurred by double rounding.
5272
5273 // Signed conversion is more or less like so:
5274 // copysign((__bf16)abs(SrcVal), SrcVal)
5275 SDValue SignBit;
5276 if (IsSigned) {
5277 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5278 DAG.getConstant(1ull << 63, DL, MVT::i64));
5279 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5280 }
5281 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5282 DAG.getConstant(~0xfffull, DL, MVT::i64));
5283 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5284 DAG.getConstant(0xfffull, DL, MVT::i64));
5285 SDValue Highest =
5286 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5287 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5288 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5289 SDValue ToRound =
5290 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5291 SDValue Rounded =
5292 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5293 {Op.getOperand(0), ToRound})
5294 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5295
5296 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5297 if (SignBit) {
5298 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5299 }
5300
5301 SDValue HasHighest = DAG.getSetCC(
5302 DL,
5303 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5304 Highest, Zero64, ISD::SETNE);
5305
5306 SDValue HasLo = DAG.getSetCC(
5307 DL,
5308 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5309 SrcLo, Zero64, ISD::SETNE);
5310
5311 SDValue NeedsAdjustment =
5312 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5313 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5314
5315 SDValue AdjustedBits =
5316 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5317 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5318 return IsStrict
5319 ? DAG.getNode(
5320 ISD::STRICT_FP_ROUND, DL,
5321 {Op.getValueType(), MVT::Other},
5322 {Rounded.getValue(1), Adjusted,
5323 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5324 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5325 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5326 }
5327 }
5328
5329 // f16 conversions are promoted to f32 when full fp16 is not supported.
5330 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5331 return IntToFpViaPromotion(MVT::f32);
5332 }
5333
5334 // i128 conversions are libcalls.
5335 if (SrcVal.getValueType() == MVT::i128)
5336 return SDValue();
5337
5338 // Other conversions are legal, unless it's to the completely software-based
5339 // fp128.
5340 if (Op.getValueType() != MVT::f128)
5341 return Op;
5342 return SDValue();
5343}
5344
5345SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5346 SelectionDAG &DAG) const {
5347 // For iOS, we want to call an alternative entry point: __sincos_stret,
5348 // which returns the values in two S / D registers.
5349 SDLoc DL(Op);
5350 SDValue Arg = Op.getOperand(0);
5351 EVT ArgVT = Arg.getValueType();
5352 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
5353
5354 ArgListTy Args;
5355 Args.emplace_back(Arg, ArgTy);
5356
5357 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5358 : RTLIB::SINCOS_STRET_F32;
5359 const char *LibcallName = getLibcallName(LC);
5360 SDValue Callee =
5361 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
5362
5363 StructType *RetTy = StructType::get(ArgTy, ArgTy);
5364 TargetLowering::CallLoweringInfo CLI(DAG);
5365 CallingConv::ID CC = getLibcallCallingConv(LC);
5366 CLI.setDebugLoc(DL)
5367 .setChain(DAG.getEntryNode())
5368 .setLibCallee(CC, RetTy, Callee, std::move(Args));
5369
5370 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5371 return CallResult.first;
5372}
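// Editorial note, not part of the original source: at the C level the libcall
// built above behaves roughly like the hypothetical declaration below, with
// both results coming back in registers (s0/s1 for float, d0/d1 for double)
// as the comment at the top of the function describes. The struct name is
// illustrative only.
//
//   struct SinCosStretF32 { float Sin, Cos; };
//   SinCosStretF32 __sincos_stret(float X);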
5373
5374static MVT getSVEContainerType(EVT ContentTy);
5375
5376SDValue
5377AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5378 SelectionDAG &DAG) const {
5379 SDLoc DL(Op);
5380 uint64_t EltSize = Op.getConstantOperandVal(2);
5381 EVT VT = Op.getValueType();
5382 switch (EltSize) {
5383 case 1:
5384 if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
5385 return SDValue();
5386 break;
5387 case 2:
5388 if (VT != MVT::v8i8 && VT != MVT::nxv8i1)
5389 return SDValue();
5390 break;
5391 case 4:
5392 if (VT != MVT::v4i16 && VT != MVT::nxv4i1)
5393 return SDValue();
5394 break;
5395 case 8:
5396 if (VT != MVT::v2i32 && VT != MVT::nxv2i1)
5397 return SDValue();
5398 break;
5399 default:
5400 // Other element sizes are incompatible with whilewr/rw, so expand instead
5401 return SDValue();
5402 }
5403
5404 SDValue PtrA = Op.getOperand(0);
5405 SDValue PtrB = Op.getOperand(1);
5406
5407 if (VT.isScalableVT())
5408 return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2));
5409
5410 // We can use the SVE whilewr/whilerw instruction to lower this
5411 // intrinsic by creating the appropriate sequence of scalable vector
5412 // operations and then extracting a fixed-width subvector from the scalable
5413 // vector. Scalable vector variants are already legal.
5414 EVT ContainerVT =
5415 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
5416 VT.getVectorNumElements(), true);
5417 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5418
5419 SDValue Mask =
5420 DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2));
5421 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
5422 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
5423 DAG.getVectorIdxConstant(0, DL));
5424}
5425
5426SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5427 SelectionDAG &DAG) const {
5428 EVT OpVT = Op.getValueType();
5429 EVT ArgVT = Op.getOperand(0).getValueType();
5430
5431 if (useSVEForFixedLengthVectorVT(OpVT, !Subtarget->isNeonAvailable()))
5432 return LowerFixedLengthBitcastToSVE(Op, DAG);
5433
5434 if (OpVT.isScalableVector()) {
5435 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5436
5437 // Handle type legalisation first.
5438 if (!isTypeLegal(ArgVT)) {
5439 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5440 "Expected int->fp bitcast!");
5441
5442 // Bitcasting between unpacked vector types of different element counts is
5443 // not a NOP because the live elements are laid out differently.
5444 // 01234567
5445 // e.g. nxv2i32 = XX??XX??
5446 // nxv4f16 = X?X?X?X?
5447 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5448 return SDValue();
5449
5450 SDValue ExtResult =
5451 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5452 Op.getOperand(0));
5453 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5454 }
5455
5456 // Bitcasts between legal types with the same element count are legal.
5457 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5458 return Op;
5459
5460 // getSVESafeBitCast does not support casting between unpacked types.
5461 if (!isPackedVectorType(OpVT, DAG))
5462 return SDValue();
5463
5464 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5465 }
5466
5467 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5468 return SDValue();
5469
5470 // Bitcasts between f16 and bf16 are legal.
5471 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5472 return Op;
5473
5474 assert(ArgVT == MVT::i16);
5475 SDLoc DL(Op);
5476
5477 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5478 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5479 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5480}
5481
5482// Returns lane if Op extracts from a two-element vector and lane is constant
5483// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5484static std::optional<uint64_t>
5485 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5486 SDNode *OpNode = Op.getNode();
5487 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5488 return std::nullopt;
5489
5490 EVT VT = OpNode->getOperand(0).getValueType();
5491 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5492 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5493 return std::nullopt;
5494
5495 return C->getZExtValue();
5496}
5497
5498 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5499 bool isSigned) {
5500 EVT VT = N.getValueType();
5501
5502 if (N.getOpcode() != ISD::BUILD_VECTOR)
5503 return false;
5504
5505 for (const SDValue &Elt : N->op_values()) {
5506 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5507 unsigned EltSize = VT.getScalarSizeInBits();
5508 unsigned HalfSize = EltSize / 2;
5509 if (isSigned) {
5510 if (!isIntN(HalfSize, C->getSExtValue()))
5511 return false;
5512 } else {
5513 if (!isUIntN(HalfSize, C->getZExtValue()))
5514 return false;
5515 }
5516 continue;
5517 }
5518 return false;
5519 }
5520
5521 return true;
5522}
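// Editorial worked example, not part of the original source: a constant
// BUILD_VECTOR such as v4i32 <0, 1, 2, 65535> satisfies the unsigned check
// above because every element fits in 16 bits (half of the 32-bit element
// width), so isZeroExtended below treats it like a zero-extended operand and
// LowerMUL can still form a UMULL for it.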
5523
5524 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5525 EVT VT = N.getValueType();
5526 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5527 EVT HalfVT = EVT::getVectorVT(
5528 *DAG.getContext(),
5529 VT.getVectorElementType().getHalfSizedIntegerVT(*DAG.getContext()),
5530 VT.getVectorNumElements());
5531 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5532}
5533
5534 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5535 return N.getOpcode() == ISD::SIGN_EXTEND ||
5536 N.getOpcode() == ISD::ANY_EXTEND ||
5537 isExtendedBUILD_VECTOR(N, DAG, true);
5538}
5539
5540 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5541 return N.getOpcode() == ISD::ZERO_EXTEND ||
5542 N.getOpcode() == ISD::ANY_EXTEND ||
5543 isExtendedBUILD_VECTOR(N, DAG, false);
5544}
5545
5546 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5547 unsigned Opcode = N.getOpcode();
5548 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5549 SDValue N0 = N.getOperand(0);
5550 SDValue N1 = N.getOperand(1);
5551 return N0->hasOneUse() && N1->hasOneUse() &&
5552 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5553 }
5554 return false;
5555}
5556
5557 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5558 unsigned Opcode = N.getOpcode();
5559 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5560 SDValue N0 = N.getOperand(0);
5561 SDValue N1 = N.getOperand(1);
5562 return N0->hasOneUse() && N1->hasOneUse() &&
5563 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5564 }
5565 return false;
5566}
5567
5568SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5569 SelectionDAG &DAG) const {
5570 // The rounding mode is in bits 23:22 of the FPCR.
5571 // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0,
5572 // and the formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
5573 // so that the shift and the AND get folded into a bitfield extract.
5574 SDLoc DL(Op);
5575
5576 SDValue Chain = Op.getOperand(0);
5577 SDValue FPCR_64 = DAG.getNode(
5578 ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5579 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)});
5580 Chain = FPCR_64.getValue(1);
5581 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5582 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5583 DAG.getConstant(1U << 22, DL, MVT::i32));
5584 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5585 DAG.getConstant(22, DL, MVT::i32));
5586 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5587 DAG.getConstant(3, DL, MVT::i32));
5588 return DAG.getMergeValues({AND, Chain}, DL);
5589}
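// Editorial sketch, not part of the original source: the bitfield arithmetic
// above restated as plain C++ over the two FPCR rounding-mode bits. The helper
// name is illustrative only.
[[maybe_unused]] static unsigned fltRoundsFromFPCR(uint32_t FPCRValue) {
  // FPCR[23:22] encodes RN=0, RP=1, RM=2, RZ=3; FLT_ROUNDS wants 1, 2, 3, 0.
  return ((FPCRValue + (1u << 22)) >> 22) & 3;
}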
5590
5591SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5592 SelectionDAG &DAG) const {
5593 SDLoc DL(Op);
5594 SDValue Chain = Op->getOperand(0);
5595 SDValue RMValue = Op->getOperand(1);
5596
5597 // The rounding mode is in bits 23:22 of the FPCR.
5598 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5599 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5600 // ((arg - 1) & 3) << 22.
5601 //
5602 // The argument of llvm.set.rounding must be within the range [0, 3], so
5603 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5604 // code that generated the llvm.set.rounding call to ensure this condition.
5605
5606 // Calculate new value of FPCR[23:22].
5607 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5608 DAG.getConstant(1, DL, MVT::i32));
5609 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5610 DAG.getConstant(0x3, DL, MVT::i32));
5611 RMValue =
5612 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5613 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5614 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5615
5616 // Get current value of FPCR.
5617 SDValue Ops[] = {
5618 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5619 SDValue FPCR =
5620 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5621 Chain = FPCR.getValue(1);
5622 FPCR = FPCR.getValue(0);
5623
5624 // Put the new rounding mode into FPCR[23:22].
5625 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5626 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5627 DAG.getConstant(RMMask, DL, MVT::i64));
5628 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5629 SDValue Ops2[] = {
5630 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5631 FPCR};
5632 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5633}
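// Editorial sketch, not part of the original source: the inverse mapping used
// by LowerSET_ROUNDING above, as plain C++. The helper name is illustrative
// only.
[[maybe_unused]] static uint64_t setRoundingBitsInFPCR(uint64_t FPCRValue,
                                                       unsigned Arg) {
  uint64_t RM = (Arg - 1u) & 0x3u; // llvm.set.rounding 0..3 -> FPCR.RMode 3,0,1,2
  FPCRValue &= ~(3ULL << 22);      // clear FPCR[23:22]
  return FPCRValue | (RM << 22);
}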
5634
5635SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5636 SelectionDAG &DAG) const {
5637 SDLoc DL(Op);
5638 SDValue Chain = Op->getOperand(0);
5639
5640 // Get current value of FPCR.
5641 SDValue Ops[] = {
5642 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5643 SDValue FPCR =
5644 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5645 Chain = FPCR.getValue(1);
5646 FPCR = FPCR.getValue(0);
5647
5648 // Truncate FPCR to 32 bits.
5649 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5650
5651 return DAG.getMergeValues({Result, Chain}, DL);
5652}
5653
5654SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5655 SelectionDAG &DAG) const {
5656 SDLoc DL(Op);
5657 SDValue Chain = Op->getOperand(0);
5658 SDValue Mode = Op->getOperand(1);
5659
5660 // Extend the specified value to 64 bits.
5661 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5662
5663 // Set new value of FPCR.
5664 SDValue Ops2[] = {
5665 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5666 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5667}
5668
5669SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5670 SelectionDAG &DAG) const {
5671 SDLoc DL(Op);
5672 SDValue Chain = Op->getOperand(0);
5673
5674 // Get current value of FPCR.
5675 SDValue Ops[] = {
5676 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5677 SDValue FPCR =
5678 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5679 Chain = FPCR.getValue(1);
5680 FPCR = FPCR.getValue(0);
5681
5682 // Clear bits that are not reserved.
5683 SDValue FPSCRMasked = DAG.getNode(
5684 ISD::AND, DL, MVT::i64, FPCR,
5685 DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
5686
5687 // Set new value of FPCR.
5688 SDValue Ops2[] = {Chain,
5689 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5690 FPSCRMasked};
5691 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5692}
5693
5694static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5695 SDLoc DL, bool &IsMLA) {
5696 bool IsN0SExt = isSignExtended(N0, DAG);
5697 bool IsN1SExt = isSignExtended(N1, DAG);
5698 if (IsN0SExt && IsN1SExt)
5699 return AArch64ISD::SMULL;
5700
5701 bool IsN0ZExt = isZeroExtended(N0, DAG);
5702 bool IsN1ZExt = isZeroExtended(N1, DAG);
5703
5704 if (IsN0ZExt && IsN1ZExt)
5705 return AArch64ISD::UMULL;
5706
5707 // Select UMULL if we can replace the other operand with an extend.
5708 EVT VT = N0.getValueType();
5709 unsigned EltSize = VT.getScalarSizeInBits();
5710 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5711 if (IsN0ZExt || IsN1ZExt) {
5712 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5713 return AArch64ISD::UMULL;
5714 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5715 DAG.MaskedValueIsZero(N1, Mask)) {
5716 // For v2i64 we look more aggressively at both operands being zero, to avoid
5717 // scalarization.
5718 return AArch64ISD::UMULL;
5719 }
5720
5721 if (IsN0SExt || IsN1SExt) {
5722 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5723 return AArch64ISD::SMULL;
5724 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5725 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5726 return AArch64ISD::SMULL;
5727 }
5728
5729 if (!IsN1SExt && !IsN1ZExt)
5730 return 0;
5731
5732 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5733 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5734 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5735 IsMLA = true;
5736 return AArch64ISD::SMULL;
5737 }
5738 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5739 IsMLA = true;
5740 return AArch64ISD::UMULL;
5741 }
5742 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5743 std::swap(N0, N1);
5744 IsMLA = true;
5745 return AArch64ISD::UMULL;
5746 }
5747 return 0;
5748}
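// Editorial sketch, not part of the original source: the known-bits test used
// above, spelled out for v4i32 operands. If the top half of every 32-bit
// element of the non-extended operand is known to be zero, it already behaves
// like a zero-extended value and UMULL is safe without an explicit extend:
//
//   APInt Mask = APInt::getHighBitsSet(/*numBits=*/32, /*hiBitsSet=*/16); // 0xFFFF0000
//   bool CanUseUMULL = DAG.MaskedValueIsZero(OtherOperand, Mask);
//
// "OtherOperand" is a placeholder for whichever multiply operand lacks the
// explicit extend.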
5749
5750SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5751 EVT VT = Op.getValueType();
5752
5753 bool OverrideNEON = !Subtarget->isNeonAvailable();
5754 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5755 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5756
5757 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5758 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5759 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5760 "unexpected type for custom-lowering ISD::MUL");
5761 SDValue N0 = Op.getOperand(0);
5762 SDValue N1 = Op.getOperand(1);
5763 bool isMLA = false;
5764 EVT OVT = VT;
5765 if (VT.is64BitVector()) {
5766 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5767 isNullConstant(N0.getOperand(1)) &&
5768 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5769 isNullConstant(N1.getOperand(1))) {
5770 N0 = N0.getOperand(0);
5771 N1 = N1.getOperand(0);
5772 VT = N0.getValueType();
5773 } else {
5774 if (VT == MVT::v1i64) {
5775 if (Subtarget->hasSVE())
5776 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5777 // Fall through to expand this. It is not legal.
5778 return SDValue();
5779 } else
5780 // Other vector multiplications are legal.
5781 return Op;
5782 }
5783 }
5784
5785 SDLoc DL(Op);
5786 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5787
5788 if (!NewOpc) {
5789 if (VT.getVectorElementType() == MVT::i64) {
5790 // If SVE is available then i64 vector multiplications can also be made
5791 // legal.
5792 if (Subtarget->hasSVE())
5793 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5794 // Fall through to expand this. It is not legal.
5795 return SDValue();
5796 } else
5797 // Other vector multiplications are legal.
5798 return Op;
5799 }
5800
5801 // Legalize to an S/UMULL instruction.
5802 SDValue Op0;
5803 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5804 if (!isMLA) {
5805 Op0 = skipExtensionForVectorMULL(N0, DAG);
5806 assert(Op0.getValueType().is64BitVector() &&
5807 Op1.getValueType().is64BitVector() &&
5808 "unexpected types for extended operands to VMULL");
5809 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5810 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5811 DAG.getConstant(0, DL, MVT::i64));
5812 }
5813 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5814 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5815 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5816 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5817 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5818 EVT Op1VT = Op1.getValueType();
5819 return DAG.getNode(
5820 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5821 DAG.getNode(N0.getOpcode(), DL, VT,
5822 DAG.getNode(NewOpc, DL, VT,
5823 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5824 DAG.getNode(NewOpc, DL, VT,
5825 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5826 DAG.getConstant(0, DL, MVT::i64));
5827}
5828
5829static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5830 int Pattern) {
5831 if (Pattern == AArch64SVEPredPattern::all)
5832 return DAG.getConstant(1, DL, VT);
5833 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5834 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5835}
5836
5837 static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
5838 bool IsSigned, bool IsEqual) {
5839 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5840 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5841
5842 if (!N->getValueType(0).isScalableVector() ||
5843 !isa<ConstantSDNode>(N->getOperand(Op1)))
5844 return SDValue();
5845
5846 SDLoc DL(N);
5847 APInt Y = N->getConstantOperandAPInt(Op1);
5848
5849 // When the second operand is the maximum value, comparisons that include
5850 // equality can never fail and thus we can return an all active predicate.
5851 if (IsEqual)
5852 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5853 return DAG.getConstant(1, DL, N->getValueType(0));
5854
5855 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
5856 return SDValue();
5857
5858 APInt X = N->getConstantOperandAPInt(Op0);
5859
5860 bool Overflow;
5861 APInt NumActiveElems =
5862 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5863
5864 if (Overflow)
5865 return SDValue();
5866
5867 if (IsEqual) {
5868 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5869 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5870 : NumActiveElems.uadd_ov(One, Overflow);
5871 if (Overflow)
5872 return SDValue();
5873 }
5874
5875 std::optional<unsigned> PredPattern =
5876 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5877 unsigned MinSVEVectorSize = std::max(
5878 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5879 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
5880 if (PredPattern != std::nullopt &&
5881 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5882 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
5883
5884 return SDValue();
5885}
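// Editorial worked example, not part of the original source: with the default
// 128-bit minimum SVE vector length, a whilelt producing nxv16i1 from the
// constants (0, 8) has 8 active lanes out of at least 128/8 = 16 available, so
// the code above folds it to a PTRUE with the VL8 pattern. The same constants
// producing nxv4i1 are left alone, because only 128/32 = 4 lanes are
// guaranteed to exist.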
5886
5887// Returns a safe bitcast between two scalable vector predicates, where
5888// any newly created lanes from a widening bitcast are defined as zero.
5889 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5890 SDLoc DL(Op);
5891 EVT InVT = Op.getValueType();
5892
5893 assert(InVT.getVectorElementType() == MVT::i1 &&
5894 VT.getVectorElementType() == MVT::i1 &&
5895 "Expected a predicate-to-predicate bitcast");
5896 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5897 InVT.isScalableVector() &&
5898 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5899 "Only expect to cast between legal scalable predicate types!");
5900
5901 // Return the operand if the cast isn't changing type.
5902 if (InVT == VT)
5903 return Op;
5904
5905 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5906 // than VT. This will increase the chances of removing casts that introduce
5907 // new lanes, which have to be explicitly zero'd.
5908 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5909 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5910 Op.getOperand(1).getValueType().bitsGT(VT))
5911 Op = Op.getOperand(1);
5912
5913 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5914
5915 // We only have to zero the lanes if new lanes are being defined, e.g. when
5916 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5917 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5918 // we can return here.
5919 if (InVT.bitsGT(VT))
5920 return Reinterpret;
5921
5922 // Check if the other lanes are already known to be zeroed by
5923 // construction.
5924 if (isZeroingInactiveLanes(Op))
5925 return Reinterpret;
5926
5927 // Zero the newly introduced lanes.
5928 SDValue Mask = DAG.getConstant(1, DL, InVT);
5929 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5930 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5931}
5932
5933SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5934 SDValue Chain, SDLoc DL,
5935 EVT VT) const {
5936 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
5937 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
5938 getPointerTy(DAG.getDataLayout()));
5939 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5940 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5941 TargetLowering::CallLoweringInfo CLI(DAG);
5942 ArgListTy Args;
5943 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5944 getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
5945 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5946 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5947 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5948 Mask);
5949}
5950
5951// Lower an SME LDR/STR ZA intrinsic
5952// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5953// folded into the instruction
5954// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5955// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5956// and tile slice registers
5957// ldr(%tileslice, %ptr, %vecnum)
5958// ->
5959// %svl = rdsvl
5960// %ptr2 = %ptr + %svl * %vecnum
5961// %tileslice2 = %tileslice + %vecnum
5962// ldr [%tileslice2, 0], [%ptr2, 0]
5963// Case 3: If the vecnum is an immediate out of range, then the same is done as
5964// case 2, but the base and slice registers are modified by the greatest
5965 // multiple of 16 not exceeding the vecnum, and the remainder is folded into the
5966// instruction. This means that successive loads and stores that are offset from
5967// each other can share the same base and slice register updates.
5968// ldr(%tileslice, %ptr, 22)
5969// ldr(%tileslice, %ptr, 23)
5970// ->
5971// %svl = rdsvl
5972 // %ptr2 = %ptr + %svl * 16
5973 // %tileslice2 = %tileslice + 16
5974 // ldr [%tileslice2, 6], [%ptr2, 6]
5975 // ldr [%tileslice2, 7], [%ptr2, 7]
5976// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5977// operand and the immediate can be folded into the instruction, like case 2.
5978// ldr(%tileslice, %ptr, %vecnum + 7)
5979// ldr(%tileslice, %ptr, %vecnum + 8)
5980// ->
5981// %svl = rdsvl
5982// %ptr2 = %ptr + %svl * %vecnum
5983// %tileslice2 = %tileslice + %vecnum
5984// ldr [%tileslice2, 7], [%ptr2, 7]
5985// ldr [%tileslice2, 8], [%ptr2, 8]
5986// Case 5: The vecnum being an add of an immediate out of range is also handled,
5987// in which case the same remainder logic as case 3 is used.
5988 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5989 SDLoc DL(N);
5990
5991 SDValue TileSlice = N->getOperand(2);
5992 SDValue Base = N->getOperand(3);
5993 SDValue VecNum = N->getOperand(4);
5994 int32_t ConstAddend = 0;
5995 SDValue VarAddend = VecNum;
5996
5997 // If the vnum is an add of an immediate, we can fold it into the instruction
5998 if (VecNum.getOpcode() == ISD::ADD &&
5999 isa<ConstantSDNode>(VecNum.getOperand(1))) {
6000 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
6001 VarAddend = VecNum.getOperand(0);
6002 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
6003 ConstAddend = ImmNode->getSExtValue();
6004 VarAddend = SDValue();
6005 }
6006
6007 int32_t ImmAddend = ConstAddend % 16;
6008 if (int32_t C = (ConstAddend - ImmAddend)) {
6009 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
6010 VarAddend = VarAddend
6011 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
6012 : CVal;
6013 }
6014
6015 if (VarAddend) {
6016 // Get the vector length that will be multiplied by vnum
6017 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6018 DAG.getConstant(1, DL, MVT::i32));
6019
6020 // Multiply SVL and vnum then add it to the base
6021 SDValue Mul = DAG.getNode(
6022 ISD::MUL, DL, MVT::i64,
6023 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
6024 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
6025 // Just add vnum to the tileslice
6026 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
6027 }
6028
6029 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
6030 DL, MVT::Other,
6031 {/*Chain=*/N.getOperand(0), TileSlice, Base,
6032 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
6033}
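// Editorial sketch, not part of the original source: the constant-vecnum split
// performed above, in isolation. The helper name is illustrative only. For
// example, vecnum 22 becomes {16, 6} and vecnum 23 becomes {16, 7}, so the two
// accesses share one base/slice update, as in case 3 of the comment above.
[[maybe_unused]] static std::pair<int32_t, int32_t> splitSMEVecnum(int32_t Vecnum) {
  int32_t Imm = Vecnum % 16;  // stays in the LDR/STR instruction
  return {Vecnum - Imm, Imm}; // {base/slice addend, instruction immediate}
}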
6034
6035 static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
6036 SDLoc DL(Op);
6037 SDValue ID =
6038 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
6039
6040 auto Op1 = Op.getOperand(1);
6041 auto Op2 = Op.getOperand(2);
6042 auto Mask = Op.getOperand(3);
6043
6044 EVT Op1VT = Op1.getValueType();
6045 EVT Op2VT = Op2.getValueType();
6046 EVT ResVT = Op.getValueType();
6047
6048 assert((Op1VT.getVectorElementType() == MVT::i8 ||
6049 Op1VT.getVectorElementType() == MVT::i16) &&
6050 "Expected 8-bit or 16-bit characters.");
6051
6052 // Scalable vector type used to wrap operands.
6053 // A single container is enough for both operands because ultimately the
6054 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
6055 EVT OpContainerVT = Op1VT.isScalableVector()
6056 ? Op1VT
6057 : getContainerForFixedLengthVector(DAG, Op1VT);
6058
6059 if (Op2VT.is128BitVector()) {
6060 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6061 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6062 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6063 if (ResVT.isScalableVector())
6064 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6065 DAG.getTargetConstant(0, DL, MVT::i64));
6066 } else {
6067 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
6068 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6069 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6070 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
6071 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6072 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6073 DAG.getConstant(0, DL, MVT::i64));
6074 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6075 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6076 }
6077
6078 // If the result is scalable, we just need to carry out the MATCH.
6079 if (ResVT.isScalableVector())
6080 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6081
6082 // If the result is fixed, we can still use MATCH but we need to wrap the
6083 // first operand and the mask in scalable vectors before doing so.
6084
6085 // Wrap the operands.
6086 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
6087 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6088 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6089
6090 // Carry out the match.
6091 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6092 ID, Mask, Op1, Op2);
6093
6094 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6095 // (v16i8/v8i8).
6096 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6097 Match = convertFromScalableVector(DAG, Op1VT, Match);
6098 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6099}
6100
6101SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6102 SelectionDAG &DAG) const {
6103 unsigned IntNo = Op.getConstantOperandVal(1);
6104 SDLoc DL(Op);
6105 switch (IntNo) {
6106 default:
6107 return SDValue(); // Don't custom lower most intrinsics.
6108 case Intrinsic::aarch64_prefetch: {
6109 SDValue Chain = Op.getOperand(0);
6110 SDValue Addr = Op.getOperand(2);
6111
6112 unsigned IsWrite = Op.getConstantOperandVal(3);
6113 unsigned Locality = Op.getConstantOperandVal(4);
6114 unsigned IsStream = Op.getConstantOperandVal(5);
6115 unsigned IsData = Op.getConstantOperandVal(6);
6116 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6117 (!IsData << 3) | // IsDataCache bit
6118 (Locality << 1) | // Cache level bits
6119 (unsigned)IsStream; // Stream bit
6120
6121 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6122 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6123 }
6124 case Intrinsic::aarch64_sme_str:
6125 case Intrinsic::aarch64_sme_ldr: {
6126 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6127 }
6128 case Intrinsic::aarch64_sme_za_enable:
6129 return DAG.getNode(
6130 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6131 Op->getOperand(0), // Chain
6132 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6133 case Intrinsic::aarch64_sme_za_disable:
6134 return DAG.getNode(
6135 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6136 Op->getOperand(0), // Chain
6137 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6138 }
6139}
6140
6141SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6142 SelectionDAG &DAG) const {
6143 unsigned IntNo = Op.getConstantOperandVal(1);
6144 SDLoc DL(Op);
6145 switch (IntNo) {
6146 default:
6147 return SDValue(); // Don't custom lower most intrinsics.
6148 case Intrinsic::aarch64_mops_memset_tag: {
6149 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6150 SDValue Chain = Node->getChain();
6151 SDValue Dst = Op.getOperand(2);
6152 SDValue Val = Op.getOperand(3);
6153 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6154 SDValue Size = Op.getOperand(4);
6155 auto Alignment = Node->getMemOperand()->getAlign();
6156 bool IsVol = Node->isVolatile();
6157 auto DstPtrInfo = Node->getPointerInfo();
6158
6159 const auto &SDI =
6160 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6161 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6162 Chain, Dst, Val, Size, Alignment, IsVol,
6163 DstPtrInfo, MachinePointerInfo{});
6164
6165 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6166 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6167 // LowerOperationWrapper will complain that the number of results has
6168 // changed.
6169 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6170 }
6171 }
6172}
6173
6174SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6175 SelectionDAG &DAG) const {
6176 unsigned IntNo = Op.getConstantOperandVal(0);
6177 SDLoc DL(Op);
6178 switch (IntNo) {
6179 default: return SDValue(); // Don't custom lower most intrinsics.
6180 case Intrinsic::thread_pointer: {
6181 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6182 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6183 }
6184 case Intrinsic::aarch64_sve_whilewr_b:
6185 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6186 Op.getOperand(1), Op.getOperand(2),
6187 DAG.getConstant(1, DL, MVT::i64));
6188 case Intrinsic::aarch64_sve_whilewr_h:
6189 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6190 Op.getOperand(1), Op.getOperand(2),
6191 DAG.getConstant(2, DL, MVT::i64));
6192 case Intrinsic::aarch64_sve_whilewr_s:
6193 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6194 Op.getOperand(1), Op.getOperand(2),
6195 DAG.getConstant(4, DL, MVT::i64));
6196 case Intrinsic::aarch64_sve_whilewr_d:
6197 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6198 Op.getOperand(1), Op.getOperand(2),
6199 DAG.getConstant(8, DL, MVT::i64));
6200 case Intrinsic::aarch64_sve_whilerw_b:
6201 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6202 Op.getOperand(1), Op.getOperand(2),
6203 DAG.getConstant(1, DL, MVT::i64));
6204 case Intrinsic::aarch64_sve_whilerw_h:
6205 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6206 Op.getOperand(1), Op.getOperand(2),
6207 DAG.getConstant(2, DL, MVT::i64));
6208 case Intrinsic::aarch64_sve_whilerw_s:
6209 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6210 Op.getOperand(1), Op.getOperand(2),
6211 DAG.getConstant(4, DL, MVT::i64));
6212 case Intrinsic::aarch64_sve_whilerw_d:
6213 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6214 Op.getOperand(1), Op.getOperand(2),
6215 DAG.getConstant(8, DL, MVT::i64));
6216 case Intrinsic::aarch64_neon_abs: {
6217 EVT Ty = Op.getValueType();
6218 if (Ty == MVT::i64) {
6219 SDValue Result =
6220 DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
6221 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6222 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
6223 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6224 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6225 } else {
6226 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6227 }
6228 }
6229 case Intrinsic::aarch64_neon_pmull64: {
6230 SDValue LHS = Op.getOperand(1);
6231 SDValue RHS = Op.getOperand(2);
6232
6233 std::optional<uint64_t> LHSLane =
6234 getConstantLaneNumOfExtractHalfOperand(LHS);
6235 std::optional<uint64_t> RHSLane =
6236 getConstantLaneNumOfExtractHalfOperand(RHS);
6237
6238 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6239 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6240
6241 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
6242 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6243 // which ISel recognizes better. For example, generate a ldr into d*
6244 // registers as opposed to a GPR load followed by a fmov.
6245 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6246 std::optional<uint64_t> OtherLane,
6247 const SDLoc &DL,
6248 SelectionDAG &DAG) -> SDValue {
6249 // If the operand is a higher half itself, rewrite it to
6250 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6251 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6252 if (NLane == 1)
6253 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6254 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6255
6256 // Operand N is not a higher half but the other operand is.
6257 if (OtherLane == 1) {
6258 // If this operand is a lower half, rewrite it to
6259 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6260 // align lanes of two operands. A roundtrip sequence (to move from lane
6261 // 1 to lane 0) is like this:
6262 // mov x8, v0.d[1]
6263 // fmov d0, x8
6264 if (NLane == 0)
6265 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6266 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6267 N.getOperand(0),
6268 DAG.getConstant(0, DL, MVT::i64)),
6269 DAG.getConstant(1, DL, MVT::i64));
6270
6271 // Otherwise just dup from main to all lanes.
6272 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6273 }
6274
6275 // Neither operand is an extract of higher half, so codegen may just use
6276 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
6277 assert(N.getValueType() == MVT::i64 &&
6278 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6279 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6280 };
6281
6282 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6283 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6284
6285 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6286 }
6287 case Intrinsic::aarch64_neon_smax:
6288 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6289 Op.getOperand(2));
6290 case Intrinsic::aarch64_neon_umax:
6291 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6292 Op.getOperand(2));
6293 case Intrinsic::aarch64_neon_smin:
6294 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6295 Op.getOperand(2));
6296 case Intrinsic::aarch64_neon_umin:
6297 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6298 Op.getOperand(2));
6299 case Intrinsic::aarch64_neon_scalar_sqxtn:
6300 case Intrinsic::aarch64_neon_scalar_sqxtun:
6301 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6302 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6303 if (Op.getValueType() == MVT::i32)
6304 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6305 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6306 Op.getOperand(0),
6307 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6308 Op.getOperand(1))));
6309 return SDValue();
6310 }
6311 case Intrinsic::aarch64_neon_sqxtn:
6312 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6313 Op.getOperand(1));
6314 case Intrinsic::aarch64_neon_sqxtun:
6315 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6316 Op.getOperand(1));
6317 case Intrinsic::aarch64_neon_uqxtn:
6318 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6319 Op.getOperand(1));
6320 case Intrinsic::aarch64_neon_sqshrn:
6321 if (Op.getValueType().isVector())
6322 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6323 DAG.getNode(AArch64ISD::VASHR, DL,
6324 Op.getOperand(1).getValueType(),
6325 Op.getOperand(1), Op.getOperand(2)));
6326 return SDValue();
6327 case Intrinsic::aarch64_neon_sqshrun:
6328 if (Op.getValueType().isVector())
6329 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6330 DAG.getNode(AArch64ISD::VASHR, DL,
6331 Op.getOperand(1).getValueType(),
6332 Op.getOperand(1), Op.getOperand(2)));
6333 return SDValue();
6334 case Intrinsic::aarch64_neon_uqshrn:
6335 if (Op.getValueType().isVector())
6336 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6337 DAG.getNode(AArch64ISD::VLSHR, DL,
6338 Op.getOperand(1).getValueType(),
6339 Op.getOperand(1), Op.getOperand(2)));
6340 return SDValue();
6341 case Intrinsic::aarch64_neon_sqrshrn:
6342 if (Op.getValueType().isVector())
6343 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6344 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6345 Op.getOperand(1).getValueType(),
6346 Op.getOperand(1), Op.getOperand(2)));
6347 return SDValue();
6348 case Intrinsic::aarch64_neon_sqrshrun:
6349 if (Op.getValueType().isVector())
6350 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6351 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6352 Op.getOperand(1).getValueType(),
6353 Op.getOperand(1), Op.getOperand(2)));
6354 return SDValue();
6355 case Intrinsic::aarch64_neon_uqrshrn:
6356 if (Op.getValueType().isVector())
6357 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6358 DAG.getNode(AArch64ISD::URSHR_I, DL,
6359 Op.getOperand(1).getValueType(),
6360 Op.getOperand(1), Op.getOperand(2)));
6361 return SDValue();
6362 case Intrinsic::aarch64_neon_sqadd:
6363 if (Op.getValueType().isVector())
6364 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6365 Op.getOperand(2));
6366 return SDValue();
6367 case Intrinsic::aarch64_neon_sqsub:
6368 if (Op.getValueType().isVector())
6369 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6370 Op.getOperand(2));
6371 return SDValue();
6372 case Intrinsic::aarch64_neon_uqadd:
6373 if (Op.getValueType().isVector())
6374 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6375 Op.getOperand(2));
6376 return SDValue();
6377 case Intrinsic::aarch64_neon_uqsub:
6378 if (Op.getValueType().isVector())
6379 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6380 Op.getOperand(2));
6381 return SDValue();
6382 case Intrinsic::aarch64_sve_whilelt:
6383 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6384 /*IsEqual=*/false);
6385 case Intrinsic::aarch64_sve_whilels:
6386 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6387 /*IsEqual=*/true);
6388 case Intrinsic::aarch64_sve_whilele:
6389 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6390 /*IsEqual=*/true);
6391 case Intrinsic::aarch64_sve_sunpkhi:
6392 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6393 Op.getOperand(1));
6394 case Intrinsic::aarch64_sve_sunpklo:
6395 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6396 Op.getOperand(1));
6397 case Intrinsic::aarch64_sve_uunpkhi:
6398 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6399 Op.getOperand(1));
6400 case Intrinsic::aarch64_sve_uunpklo:
6401 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6402 Op.getOperand(1));
6403 case Intrinsic::aarch64_sve_clasta_n:
6404 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6405 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6406 case Intrinsic::aarch64_sve_clastb_n:
6407 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6408 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6409 case Intrinsic::aarch64_sve_lasta:
6410 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6411 Op.getOperand(1), Op.getOperand(2));
6412 case Intrinsic::aarch64_sve_lastb:
6413 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6414 Op.getOperand(1), Op.getOperand(2));
6415 case Intrinsic::aarch64_sve_rev:
6416 return DAG.getNode(ISD::VECTOR_REVERSE, DL, Op.getValueType(),
6417 Op.getOperand(1));
6418 case Intrinsic::aarch64_sve_tbl:
6419 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6420 Op.getOperand(2));
6421 case Intrinsic::aarch64_sve_trn1:
6422 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6423 Op.getOperand(1), Op.getOperand(2));
6424 case Intrinsic::aarch64_sve_trn2:
6425 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6426 Op.getOperand(1), Op.getOperand(2));
6427 case Intrinsic::aarch64_sve_uzp1:
6428 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6429 Op.getOperand(1), Op.getOperand(2));
6430 case Intrinsic::aarch64_sve_uzp2:
6431 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6432 Op.getOperand(1), Op.getOperand(2));
6433 case Intrinsic::aarch64_sve_zip1:
6434 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6435 Op.getOperand(1), Op.getOperand(2));
6436 case Intrinsic::aarch64_sve_zip2:
6437 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6438 Op.getOperand(1), Op.getOperand(2));
6439 case Intrinsic::aarch64_sve_splice:
6440 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6441 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6442 case Intrinsic::aarch64_sve_ptrue:
6443 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
6444 case Intrinsic::aarch64_sve_clz:
6445 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6446 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6447 case Intrinsic::aarch64_sme_cntsd: {
6448 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6449 DAG.getConstant(1, DL, MVT::i32));
6450 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6451 DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
6452 }
6453 case Intrinsic::aarch64_sve_cnt: {
6454 SDValue Data = Op.getOperand(3);
6455 // CTPOP only supports integer operands.
6456 if (Data.getValueType().isFloatingPoint())
6457 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6458 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6459 Op.getOperand(2), Data, Op.getOperand(1));
6460 }
6461 case Intrinsic::aarch64_sve_dupq_lane:
6462 return LowerDUPQLane(Op, DAG);
6463 case Intrinsic::aarch64_sve_convert_from_svbool:
6464 if (Op.getValueType() == MVT::aarch64svcount)
6465 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6466 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6467 case Intrinsic::aarch64_sve_convert_to_svbool:
6468 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6469 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6470 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6471 case Intrinsic::aarch64_sve_fneg:
6472 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6473 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6474 case Intrinsic::aarch64_sve_frintp:
6475 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6476 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6477 case Intrinsic::aarch64_sve_frintm:
6478 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6479 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6480 case Intrinsic::aarch64_sve_frinti:
6481 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6482 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6483 Op.getOperand(1));
6484 case Intrinsic::aarch64_sve_frintx:
6485 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6486 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6487 case Intrinsic::aarch64_sve_frinta:
6488 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6489 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6490 case Intrinsic::aarch64_sve_frintn:
6491 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6492 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6493 Op.getOperand(1));
6494 case Intrinsic::aarch64_sve_frintz:
6495 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6496 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6497 case Intrinsic::aarch64_sve_ucvtf:
6498 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6499 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6500 Op.getOperand(1));
6501 case Intrinsic::aarch64_sve_scvtf:
6502 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6503 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6504 Op.getOperand(1));
6505 case Intrinsic::aarch64_sve_fcvtzu:
6506 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6507 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6508 case Intrinsic::aarch64_sve_fcvtzs:
6509 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6510 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6511 case Intrinsic::aarch64_sve_fsqrt:
6512 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6513 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6514 case Intrinsic::aarch64_sve_frecpx:
6515 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6516 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6517 case Intrinsic::aarch64_sve_frecpe_x:
6518 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6519 Op.getOperand(1));
6520 case Intrinsic::aarch64_sve_frecps_x:
6521 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6522 Op.getOperand(1), Op.getOperand(2));
6523 case Intrinsic::aarch64_sve_frsqrte_x:
6524 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6525 Op.getOperand(1));
6526 case Intrinsic::aarch64_sve_frsqrts_x:
6527 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6528 Op.getOperand(1), Op.getOperand(2));
6529 case Intrinsic::aarch64_sve_fabs:
6530 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6531 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6532 case Intrinsic::aarch64_sve_abs:
6533 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6534 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6535 case Intrinsic::aarch64_sve_neg:
6536 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6537 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6538 case Intrinsic::aarch64_sve_insr: {
6539 SDValue Scalar = Op.getOperand(2);
6540 EVT ScalarTy = Scalar.getValueType();
6541 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6542 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6543
6544 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6545 Op.getOperand(1), Scalar);
6546 }
6547 case Intrinsic::aarch64_sve_rbit:
6548 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6549 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6550 Op.getOperand(1));
6551 case Intrinsic::aarch64_sve_revb:
6552 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6553 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6554 case Intrinsic::aarch64_sve_revh:
6555 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6556 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6557 case Intrinsic::aarch64_sve_revw:
6558 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6559 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6560 case Intrinsic::aarch64_sve_revd:
6561 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6562 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6563 case Intrinsic::aarch64_sve_sxtb:
6564 return DAG.getNode(
6565 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6566 Op.getOperand(2), Op.getOperand(3),
6567 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6568 Op.getOperand(1));
6569 case Intrinsic::aarch64_sve_sxth:
6570 return DAG.getNode(
6571 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6572 Op.getOperand(2), Op.getOperand(3),
6573 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6574 Op.getOperand(1));
6575 case Intrinsic::aarch64_sve_sxtw:
6576 return DAG.getNode(
6577 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6578 Op.getOperand(2), Op.getOperand(3),
6579 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6580 Op.getOperand(1));
6581 case Intrinsic::aarch64_sve_uxtb:
6582 return DAG.getNode(
6583 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6584 Op.getOperand(2), Op.getOperand(3),
6585 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6586 Op.getOperand(1));
6587 case Intrinsic::aarch64_sve_uxth:
6588 return DAG.getNode(
6589 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6590 Op.getOperand(2), Op.getOperand(3),
6591 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6592 Op.getOperand(1));
6593 case Intrinsic::aarch64_sve_uxtw:
6594 return DAG.getNode(
6595 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6596 Op.getOperand(2), Op.getOperand(3),
6597 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6598 Op.getOperand(1));
6599 case Intrinsic::localaddress: {
6600 const auto &MF = DAG.getMachineFunction();
6601 const auto *RegInfo = Subtarget->getRegisterInfo();
6602 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6603 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6604 Op.getSimpleValueType());
6605 }
6606
6607 case Intrinsic::eh_recoverfp: {
6608 // FIXME: This needs to be implemented to correctly handle highly aligned
6609 // stack objects. For now we simply return the incoming FP. Refer D53541
6610 // for more details.
6611 SDValue FnOp = Op.getOperand(1);
6612 SDValue IncomingFPOp = Op.getOperand(2);
6613 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6614 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6615 if (!Fn)
6617 "llvm.eh.recoverfp must take a function as the first argument");
6618 return IncomingFPOp;
6619 }
6620
6621 case Intrinsic::aarch64_neon_vsri:
6622 case Intrinsic::aarch64_neon_vsli:
6623 case Intrinsic::aarch64_sve_sri:
6624 case Intrinsic::aarch64_sve_sli: {
6625 EVT Ty = Op.getValueType();
6626
6627 if (!Ty.isVector())
6628 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6629
6630 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6631
6632 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6633 IntNo == Intrinsic::aarch64_sve_sri;
6634 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6635 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6636 Op.getOperand(3));
6637 }
6638
6639 case Intrinsic::aarch64_neon_srhadd:
6640 case Intrinsic::aarch64_neon_urhadd:
6641 case Intrinsic::aarch64_neon_shadd:
6642 case Intrinsic::aarch64_neon_uhadd: {
6643 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6644 IntNo == Intrinsic::aarch64_neon_shadd);
6645 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6646 IntNo == Intrinsic::aarch64_neon_urhadd);
6647 unsigned Opcode = IsSignedAdd
6648 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6649 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6650 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6651 Op.getOperand(2));
6652 }
6653 case Intrinsic::aarch64_neon_saddlp:
6654 case Intrinsic::aarch64_neon_uaddlp: {
6655 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6656 ? AArch64ISD::UADDLP
6657 : AArch64ISD::SADDLP;
6658 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6659 }
6660 case Intrinsic::aarch64_neon_sdot:
6661 case Intrinsic::aarch64_neon_udot:
6662 case Intrinsic::aarch64_sve_sdot:
6663 case Intrinsic::aarch64_sve_udot: {
6664 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6665 IntNo == Intrinsic::aarch64_sve_udot)
6666 ? AArch64ISD::UDOT
6667 : AArch64ISD::SDOT;
6668 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6669 Op.getOperand(2), Op.getOperand(3));
6670 }
6671 case Intrinsic::aarch64_neon_usdot:
6672 case Intrinsic::aarch64_sve_usdot: {
6673 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6674 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6675 }
6676 case Intrinsic::aarch64_neon_saddlv:
6677 case Intrinsic::aarch64_neon_uaddlv: {
6678 EVT OpVT = Op.getOperand(1).getValueType();
6679 EVT ResVT = Op.getValueType();
6680 assert(
6681 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6682 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6683 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6684 "Unexpected aarch64_neon_u/saddlv type");
6685 (void)OpVT;
6686 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6687 SDValue ADDLV = DAG.getNode(
6688 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6689 : AArch64ISD::SADDLV,
6690 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6691 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6692 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6693 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6694 return EXTRACT_VEC_ELT;
6695 }
6696 case Intrinsic::experimental_cttz_elts: {
6697 SDValue CttzOp = Op.getOperand(1);
6698 EVT VT = CttzOp.getValueType();
6699 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6700
6701 if (VT.isFixedLengthVector()) {
6702 // We can use SVE instructions to lower this intrinsic by first creating
6703 // an SVE predicate register mask from the fixed-width vector.
6704 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6705 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
6706 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6707 }
6708
6709 SDValue NewCttzElts =
6710 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
6711 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
6712 }
6713 case Intrinsic::experimental_vector_match: {
6714 return LowerVectorMatch(Op, DAG);
6715 }
6716 }
6717}
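// A standalone sketch (not part of this file; helper names are ad hoc): the
// s/urhadd and s/uhadd intrinsics handled above become the generic
// ISD::AVGCEIL*/AVGFLOOR* nodes. The self-contained program below illustrates
// the underlying semantics: a halving add averages two lanes without
// widening, and the "rounding" form rounds up instead of down.
#include <cassert>
#include <cstdint>

// floor((a + b) / 2), computed without intermediate overflow.
static uint8_t haddFloor(uint8_t a, uint8_t b) {
  return (a & b) + ((a ^ b) >> 1);
}

// ceil((a + b) / 2), i.e. the rounding halving add, also overflow-free.
static uint8_t haddCeil(uint8_t a, uint8_t b) {
  return (a | b) - ((a ^ b) >> 1);
}

int main() {
  assert(haddFloor(250, 251) == 250); // uhadd-style rounding (down)
  assert(haddCeil(250, 251) == 251);  // urhadd-style rounding (up)
}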
6718
6719bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6720 if (VT.getVectorElementType() == MVT::i8 ||
6721 VT.getVectorElementType() == MVT::i16) {
6722 EltTy = MVT::i32;
6723 return true;
6724 }
6725 return false;
6726}
6727
6728bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6729 EVT DataVT) const {
6730 const EVT IndexVT = Extend.getOperand(0).getValueType();
6731 // SVE only supports implicit extension of 32-bit indices.
6732 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6733 return false;
6734
6735 // Indices cannot be smaller than the main data type.
6736 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6737 return false;
6738
6739 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6740 // element container type, which would violate the previous clause.
6741 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6742}
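// For example, a zero-extended v4i32 index feeding a gather of 32-bit data
// can use the instruction's implicit uxtw form, whereas nxv2i32 data is
// promoted into 64-bit element containers, so a 32-bit index would end up
// narrower than the data elements and the explicit extend has to stay.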
6743
6744bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6745 EVT ExtVT = ExtVal.getValueType();
6746 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6747 return false;
6748
6749 // It may be worth creating extending masked loads if there are multiple
6750 // masked loads using the same predicate. That way we'll end up creating
6751 // extending masked loads that may then get split by the legaliser. This
6752 // results in just one set of predicate unpacks at the start, instead of
6753 // multiple sets of vector unpacks after each load.
6754 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6755 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6756 // Disable extending masked loads for fixed-width for now, since the code
6757 // quality doesn't look great.
6758 if (!ExtVT.isScalableVector())
6759 return false;
6760
6761 unsigned NumExtMaskedLoads = 0;
6762 for (auto *U : Ld->getMask()->users())
6763 if (isa<MaskedLoadSDNode>(U))
6764 NumExtMaskedLoads++;
6765
6766 if (NumExtMaskedLoads <= 1)
6767 return false;
6768 }
6769 }
6770
6771 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
6772 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
6773 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
6774}
6775
6776unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6777 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6778 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6779 AArch64ISD::GLD1_MERGE_ZERO},
6780 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6781 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6782 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6783 AArch64ISD::GLD1_MERGE_ZERO},
6784 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6785 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6786 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6787 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6788 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6789 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6790 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6791 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6792 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6793 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6794 };
6795 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6796 return AddrModes.find(Key)->second;
6797}
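// Note that the Signed flag only matters when the index needs extending: for
// 64-bit indices both the signed and unsigned keys deliberately map to the
// same GLD1 / GLD1_SCALED opcode, since the SXTW/UXTW variants only describe
// how a 32-bit index is widened.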
6798
6799unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6800 switch (Opcode) {
6801 default:
6802 llvm_unreachable("unimplemented opcode");
6803 return Opcode;
6804 case AArch64ISD::GLD1_MERGE_ZERO:
6805 return AArch64ISD::GLD1S_MERGE_ZERO;
6806 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6807 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6808 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6809 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6810 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6811 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6812 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6813 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6814 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6815 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6816 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6817 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6818 }
6819}
6820
6821SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6822 SelectionDAG &DAG) const {
6823 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6824
6825 SDLoc DL(Op);
6826 SDValue Chain = MGT->getChain();
6827 SDValue PassThru = MGT->getPassThru();
6828 SDValue Mask = MGT->getMask();
6829 SDValue BasePtr = MGT->getBasePtr();
6830 SDValue Index = MGT->getIndex();
6831 SDValue Scale = MGT->getScale();
6832 EVT VT = Op.getValueType();
6833 EVT MemVT = MGT->getMemoryVT();
6834 ISD::LoadExtType ExtType = MGT->getExtensionType();
6835 ISD::MemIndexType IndexType = MGT->getIndexType();
6836
6837 // SVE only supports zero (and so undef) passthrough values; everything
6838 // else must be handled manually by an explicit select on the load's output.
6839 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6840 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6841 SDValue Load =
6842 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6843 MGT->getMemOperand(), IndexType, ExtType);
6844 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6845 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6846 }
6847
6848 bool IsScaled = MGT->isIndexScaled();
6849 bool IsSigned = MGT->isIndexSigned();
6850
6851 // SVE only supports an index scaled by sizeof(MemVT.elt); everything else
6852 // must be calculated beforehand.
6853 uint64_t ScaleVal = Scale->getAsZExtVal();
6854 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6855 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6856 EVT IndexVT = Index.getValueType();
6857 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6858 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6859 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6860
6861 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6862 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6863 MGT->getMemOperand(), IndexType, ExtType);
6864 }
6865
6866 // Lower fixed length gather to a scalable equivalent.
6867 if (VT.isFixedLengthVector()) {
6868 assert(Subtarget->useSVEForFixedLengthVectors() &&
6869 "Cannot lower when not using SVE for fixed vectors!");
6870
6871 // NOTE: Handle floating-point as if integer then bitcast the result.
6872 EVT DataVT = VT.changeVectorElementTypeToInteger();
6873 MemVT = MemVT.changeVectorElementTypeToInteger();
6874
6875 // Find the smallest integer fixed length vector we can use for the gather.
6876 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6877 if (DataVT.getVectorElementType() == MVT::i64 ||
6878 Index.getValueType().getVectorElementType() == MVT::i64 ||
6879 Mask.getValueType().getVectorElementType() == MVT::i64)
6880 PromotedVT = VT.changeVectorElementType(MVT::i64);
6881
6882 // Promote vector operands except for passthrough, which we know is either
6883 // undef or zero, and thus best constructed directly.
6884 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6885 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6886 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6887
6888 // A promoted result type forces the need for an extending load.
6889 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6890 ExtType = ISD::EXTLOAD;
6891
6892 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6893
6894 // Convert fixed length vector operands to scalable.
6895 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6896 Index = convertToScalableVector(DAG, ContainerVT, Index);
6898 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6899 : DAG.getConstant(0, DL, ContainerVT);
6900
6901 // Emit equivalent scalable vector gather.
6902 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6903 SDValue Load =
6904 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6905 Ops, MGT->getMemOperand(), IndexType, ExtType);
6906
6907 // Extract fixed length data then convert to the required result type.
6908 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6909 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6910 if (VT.isFloatingPoint())
6911 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6912
6913 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6914 }
6915
6916 // Everything else is legal.
6917 return Op;
6918}
6919
6920SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6921 SelectionDAG &DAG) const {
6922 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6923
6924 SDLoc DL(Op);
6925 SDValue Chain = MSC->getChain();
6926 SDValue StoreVal = MSC->getValue();
6927 SDValue Mask = MSC->getMask();
6928 SDValue BasePtr = MSC->getBasePtr();
6929 SDValue Index = MSC->getIndex();
6930 SDValue Scale = MSC->getScale();
6931 EVT VT = StoreVal.getValueType();
6932 EVT MemVT = MSC->getMemoryVT();
6933 ISD::MemIndexType IndexType = MSC->getIndexType();
6934 bool Truncating = MSC->isTruncatingStore();
6935
6936 bool IsScaled = MSC->isIndexScaled();
6937 bool IsSigned = MSC->isIndexSigned();
6938
6939 // SVE only supports an index scaled by sizeof(MemVT.elt); everything else
6940 // must be calculated beforehand.
6941 uint64_t ScaleVal = Scale->getAsZExtVal();
6942 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6943 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6944 EVT IndexVT = Index.getValueType();
6945 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6946 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6947 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6948
6949 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6950 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6951 MSC->getMemOperand(), IndexType, Truncating);
6952 }
6953
6954 // Lower fixed length scatter to a scalable equivalent.
6955 if (VT.isFixedLengthVector()) {
6956 assert(Subtarget->useSVEForFixedLengthVectors() &&
6957 "Cannot lower when not using SVE for fixed vectors!");
6958
6959 // Once bitcast we treat floating-point scatters as if integer.
6960 if (VT.isFloatingPoint()) {
6961 VT = VT.changeVectorElementTypeToInteger();
6962 MemVT = MemVT.changeVectorElementTypeToInteger();
6963 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6964 }
6965
6966 // Find the smallest integer fixed length vector we can use for the scatter.
6967 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6968 if (VT.getVectorElementType() == MVT::i64 ||
6969 Index.getValueType().getVectorElementType() == MVT::i64 ||
6970 Mask.getValueType().getVectorElementType() == MVT::i64)
6971 PromotedVT = VT.changeVectorElementType(MVT::i64);
6972
6973 // Promote vector operands.
6974 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6975 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6976 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6977 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6978
6979 // A promoted value type forces the need for a truncating store.
6980 if (PromotedVT != VT)
6981 Truncating = true;
6982
6983 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6984
6985 // Convert fixed length vector operands to scalable.
6986 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6987 Index = convertToScalableVector(DAG, ContainerVT, Index);
6988 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6989 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6990
6991 // Emit equivalent scalable vector scatter.
6992 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6993 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6994 MSC->getMemOperand(), IndexType, Truncating);
6995 }
6996
6997 // Everything else is legal.
6998 return Op;
6999}
7000
7001SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
7002 SDLoc DL(Op);
7003 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
7004 assert(LoadNode && "Expected custom lowering of a masked load node");
7005 EVT VT = Op->getValueType(0);
7006
7007 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7008 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
7009
7010 SDValue PassThru = LoadNode->getPassThru();
7011 SDValue Mask = LoadNode->getMask();
7012
7013 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
7014 return Op;
7015
7016 SDValue Load = DAG.getMaskedLoad(
7017 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7018 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
7019 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
7020 LoadNode->getExtensionType());
7021
7022 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7023
7024 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7025}
7026
7027// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
7028static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
7029 EVT VT, EVT MemVT,
7030 SelectionDAG &DAG) {
7031 assert(VT.isVector() && "VT should be a vector type");
7032 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
7033
7034 SDValue Value = ST->getValue();
7035
7036 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
7037 // extracts the word lane which represents the v4i8 subvector. It optimizes
7038 // the store to:
7039 //
7040 // xtn v0.8b, v0.8h
7041 // str s0, [x0]
7042
7043 SDValue Undef = DAG.getUNDEF(MVT::i16);
7044 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
7045 {Undef, Undef, Undef, Undef});
7046
7047 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
7048 Value, UndefVec);
7049 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7050
7051 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7052 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7053 Trunc, DAG.getConstant(0, DL, MVT::i64));
7054
7055 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7056 ST->getBasePtr(), ST->getMemOperand());
7057}
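// A standalone scalar model of the store produced above (not part of this
// file; helper names are ad hoc): the low byte of each v4i16 lane ends up
// packed into a single 32-bit word, which is what the lone "str s0" writes.
#include <cassert>
#include <cstdint>

static uint32_t packLowBytes(const uint16_t lanes[4]) {
  uint32_t word = 0;
  for (int i = 0; i < 4; ++i)
    word |= uint32_t(lanes[i] & 0xff) << (8 * i); // little-endian lane order
  return word;
}

int main() {
  const uint16_t v[4] = {0x1234, 0x56ab, 0x00ff, 0x0001};
  assert(packLowBytes(v) == 0x01ffab34u);
}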
7058
7059static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
7060 SDLoc DL(Op);
7061 SDValue Src = Op.getOperand(0);
7062 MVT DestVT = Op.getSimpleValueType();
7063 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7064 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
7065
7066 unsigned SrcAS = N->getSrcAddressSpace();
7067 unsigned DestAS = N->getDestAddressSpace();
7068 assert(SrcAS != DestAS &&
7069 "addrspacecast must be between different address spaces");
7070 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7071 TLI.getTargetMachine().getPointerSize(DestAS) &&
7072 "addrspacecast must be between different ptr sizes");
7073 (void)TLI;
7074
7075 if (SrcAS == ARM64AS::PTR32_SPTR) {
7076 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7077 DAG.getTargetConstant(0, DL, DestVT));
7078 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7079 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7080 DAG.getTargetConstant(0, DL, DestVT));
7081 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7082 (DestAS == ARM64AS::PTR32_UPTR)) {
7083 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7084 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7085 return Trunc;
7086 } else {
7087 return Src;
7088 }
7089}
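// A standalone model of the two widening rules above (not part of this file;
// plain integers stand in for PTR32 pointer values, helper names are ad hoc):
// casts from the signed 32-bit address space sign-extend, casts from the
// unsigned one zero-extend.
#include <cassert>
#include <cstdint>

static uint64_t widenSignedPtr32(uint32_t P) {
  return static_cast<uint64_t>(static_cast<int64_t>(static_cast<int32_t>(P)));
}
static uint64_t widenUnsignedPtr32(uint32_t P) { return P; }

int main() {
  assert(widenSignedPtr32(0x80000000u) == 0xffffffff80000000ull);
  assert(widenUnsignedPtr32(0x80000000u) == 0x0000000080000000ull);
}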
7090
7091// Custom lowering for stores, vector or scalar, truncating or not. Currently
7092// we only custom lower truncating stores from v4i16 to v4i8 and volatile
7093// stores of i128.
7094SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7095 SelectionDAG &DAG) const {
7096 SDLoc Dl(Op);
7097 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7098 assert (StoreNode && "Can only custom lower store nodes");
7099
7100 SDValue Value = StoreNode->getValue();
7101
7102 EVT VT = Value.getValueType();
7103 EVT MemVT = StoreNode->getMemoryVT();
7104
7105 if (VT.isVector()) {
7106 if (useSVEForFixedLengthVectorVT(
7107 VT,
7108 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7109 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7110
7111 unsigned AS = StoreNode->getAddressSpace();
7112 Align Alignment = StoreNode->getAlign();
7113 if (Alignment < MemVT.getStoreSize() &&
7114 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7115 StoreNode->getMemOperand()->getFlags(),
7116 nullptr)) {
7117 return scalarizeVectorStore(StoreNode, DAG);
7118 }
7119
7120 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7121 MemVT == MVT::v4i8) {
7122 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7123 }
7124 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
7125 // the custom lowering, as there are no un-paired non-temporal stores and
7126 // legalization will break up 256 bit inputs.
7127 ElementCount EC = MemVT.getVectorElementCount();
7128 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
7129 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
7130 (MemVT.getScalarSizeInBits() == 8u ||
7131 MemVT.getScalarSizeInBits() == 16u ||
7132 MemVT.getScalarSizeInBits() == 32u ||
7133 MemVT.getScalarSizeInBits() == 64u)) {
7134 SDValue Lo =
7135 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7136 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7137 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
7138 SDValue Hi =
7139 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7140 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7141 StoreNode->getValue(),
7142 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
7143 SDValue Result = DAG.getMemIntrinsicNode(
7144 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
7145 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7146 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7147 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7148 return Result;
7149 }
7150 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7151 return LowerStore128(Op, DAG);
7152 } else if (MemVT == MVT::i64x8) {
7153 SDValue Value = StoreNode->getValue();
7154 assert(Value->getValueType(0) == MVT::i64x8);
7155 SDValue Chain = StoreNode->getChain();
7156 SDValue Base = StoreNode->getBasePtr();
7157 EVT PtrVT = Base.getValueType();
7158 for (unsigned i = 0; i < 8; i++) {
7159 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
7160 Value, DAG.getConstant(i, Dl, MVT::i32));
7161 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7162 DAG.getConstant(i * 8, Dl, PtrVT));
7163 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7164 StoreNode->getBaseAlign());
7165 }
7166 return Chain;
7167 }
7168
7169 return SDValue();
7170}
7171
7172/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7173SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7174 SelectionDAG &DAG) const {
7175 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7176 assert(StoreNode->getMemoryVT() == MVT::i128);
7177 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7178
7179 bool IsStoreRelease =
7180 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
7181 if (StoreNode->isAtomic())
7182 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7183 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7184 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
7185 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
7186
7187 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7188 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7189 ? StoreNode->getOperand(1)
7190 : StoreNode->getOperand(2);
7191 SDLoc DL(Op);
7192 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7193 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7194 if (DAG.getDataLayout().isBigEndian())
7195 std::swap(StoreValue.first, StoreValue.second);
7196 SDValue Result = DAG.getMemIntrinsicNode(
7197 Opcode, DL, DAG.getVTList(MVT::Other),
7198 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7199 StoreNode->getBasePtr()},
7200 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7201 return Result;
7202}
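// A standalone sketch of the value split performed above (not part of this
// file; unsigned __int128 is a GCC/Clang extension used to model the i128
// operand, helper name is ad hoc): the store becomes two 64-bit halves,
// swapped on big-endian targets so the paired store still writes the bytes
// in memory order.
#include <cassert>
#include <cstdint>
#include <utility>

static std::pair<uint64_t, uint64_t> splitForStp(unsigned __int128 V,
                                                 bool BigEndian) {
  uint64_t Lo = static_cast<uint64_t>(V);
  uint64_t Hi = static_cast<uint64_t>(V >> 64);
  return BigEndian ? std::make_pair(Hi, Lo) : std::make_pair(Lo, Hi);
}

int main() {
  unsigned __int128 V = (static_cast<unsigned __int128>(0x1122334455667788ull)
                         << 64) | 0x99aabbccddeeff00ull;
  auto LE = splitForStp(V, /*BigEndian=*/false);
  assert(LE.first == 0x99aabbccddeeff00ull && LE.second == 0x1122334455667788ull);
  auto BE = splitForStp(V, /*BigEndian=*/true);
  assert(BE.first == 0x1122334455667788ull && BE.second == 0x99aabbccddeeff00ull);
}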
7203
7204SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7205 SelectionDAG &DAG) const {
7206 SDLoc DL(Op);
7207 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7208 assert(LoadNode && "Expected custom lowering of a load node");
7209
7210 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7211 SmallVector<SDValue, 8> Ops;
7212 SDValue Base = LoadNode->getBasePtr();
7213 SDValue Chain = LoadNode->getChain();
7214 EVT PtrVT = Base.getValueType();
7215 for (unsigned i = 0; i < 8; i++) {
7216 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7217 DAG.getConstant(i * 8, DL, PtrVT));
7218 SDValue Part =
7219 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7220 LoadNode->getBaseAlign());
7221 Ops.push_back(Part);
7222 Chain = SDValue(Part.getNode(), 1);
7223 }
7224 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7225 return DAG.getMergeValues({Loaded, Chain}, DL);
7226 }
7227
7228 // Custom lowering for extending v4i8 vector loads.
7229 EVT VT = Op->getValueType(0);
7230 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7231
7232 if (LoadNode->getMemoryVT() != MVT::v4i8)
7233 return SDValue();
7234
7235 // Avoid generating unaligned loads.
7236 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7237 return SDValue();
7238
7239 unsigned ExtType;
7240 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7241 ExtType = ISD::SIGN_EXTEND;
7242 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7243 LoadNode->getExtensionType() == ISD::EXTLOAD)
7244 ExtType = ISD::ZERO_EXTEND;
7245 else
7246 return SDValue();
7247
7248 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7249 LoadNode->getBasePtr(), MachinePointerInfo());
7250 SDValue Chain = Load.getValue(1);
7251 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7252 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7253 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7254 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7255 DAG.getConstant(0, DL, MVT::i64));
7256 if (VT == MVT::v4i32)
7257 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7258 return DAG.getMergeValues({Ext, Chain}, DL);
7259}
7260
7261SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7262 SelectionDAG &DAG) const {
7263 SDLoc DL(Op);
7264 SDValue Vec = Op.getOperand(0);
7265 SDValue Mask = Op.getOperand(1);
7266 SDValue Passthru = Op.getOperand(2);
7267 EVT VecVT = Vec.getValueType();
7268 EVT MaskVT = Mask.getValueType();
7269 EVT ElmtVT = VecVT.getVectorElementType();
7270 const bool IsFixedLength = VecVT.isFixedLengthVector();
7271 const bool HasPassthru = !Passthru.isUndef();
7272 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7273 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7274
7275 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7276
7277 if (!Subtarget->isSVEAvailable())
7278 return SDValue();
7279
7280 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7281 return SDValue();
7282
7283 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7284 if (MinElmts != 2 && MinElmts != 4)
7285 return SDValue();
7286
7287 // We can use the SVE register containing the NEON vector in its lowest bits.
7288 if (IsFixedLength) {
7289 EVT ScalableVecVT =
7290 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7291 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7292 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7293
7294 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7295 DAG.getUNDEF(ScalableVecVT), Vec,
7296 DAG.getConstant(0, DL, MVT::i64));
7297 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7298 DAG.getUNDEF(ScalableMaskVT), Mask,
7299 DAG.getConstant(0, DL, MVT::i64));
7300 Mask = DAG.getNode(ISD::TRUNCATE, DL,
7301 ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7302 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7303 DAG.getUNDEF(ScalableVecVT), Passthru,
7304 DAG.getConstant(0, DL, MVT::i64));
7305
7306 VecVT = Vec.getValueType();
7307 MaskVT = Mask.getValueType();
7308 }
7309
7310 // Get legal type for compact instruction
7311 EVT ContainerVT = getSVEContainerType(VecVT);
7312 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7313
7314 // Convert to i32 or i64 for smaller types, as these are the only supported
7315 // sizes for compact.
7316 if (ContainerVT != VecVT) {
7317 Vec = DAG.getBitcast(CastVT, Vec);
7318 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7319 }
7320
7321 SDValue Compressed = DAG.getNode(
7322 ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
7323 DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
7324
7325 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7326 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7327 SDValue Offset = DAG.getNode(
7328 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7329 DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
7330
7331 SDValue IndexMask = DAG.getNode(
7332 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7333 DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7334 DAG.getConstant(0, DL, MVT::i64), Offset);
7335
7336 Compressed =
7337 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7338 }
7339
7340 // Extracting from a legal SVE type before truncating produces better code.
7341 if (IsFixedLength) {
7342 Compressed = DAG.getNode(
7343 ISD::EXTRACT_SUBVECTOR, DL,
7344 FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7345 Compressed, DAG.getConstant(0, DL, MVT::i64));
7346 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7347 VecVT = FixedVecVT;
7348 }
7349
7350 // If we changed the element type before, we need to convert it back.
7351 if (ContainerVT != VecVT) {
7352 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7353 Compressed = DAG.getBitcast(VecVT, Compressed);
7354 }
7355
7356 return Compressed;
7357}
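// The passthru handling above works in two steps: aarch64_sve_cntp counts how
// many lanes the compress kept, and aarch64_sve_whilelo builds a predicate
// that is true for exactly that many leading lanes, so the final vselect
// keeps the compacted elements and fills the tail from the passthru vector.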
7358
7359// Generate SUBS and CSEL for integer abs.
7360SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7361 MVT VT = Op.getSimpleValueType();
7362
7363 if (VT.isVector())
7364 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7365
7366 SDLoc DL(Op);
7367 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
7368
7369 // Generate SUBS & CSEL.
7370 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7371 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7372 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7373 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7374}
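// A standalone model of the SUBS + CSEL pattern above (not part of this file;
// helper name is ad hoc): compare the value against zero, then select the
// original value when the PL condition ("plus or zero") holds, otherwise its
// negation.
#include <cassert>
#include <cstdint>

static int64_t absViaCsel(int64_t X) {
  int64_t Neg = 0 - X;  // the negated operand fed to CSEL
  bool PL = X >= 0;     // SUBS X, #0 sets N; PL means the N flag is clear
  return PL ? X : Neg;  // CSEL X, Neg, pl
}

int main() {
  assert(absViaCsel(-7) == 7);
  assert(absViaCsel(42) == 42);
}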
7375
7376SDValue AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
7377 SDValue Chain = Op.getOperand(0);
7378 SDValue Cond = Op.getOperand(1);
7379 SDValue Dest = Op.getOperand(2);
7380
7381 AArch64CC::CondCode CC;
7382 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7383 SDLoc DL(Op);
7384 SDValue CCVal = getCondCode(DAG, CC);
7385 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7386 Cmp);
7387 }
7388
7389 return SDValue();
7390}
7391
7392// Treat FSHR with a constant shift amount as a legal operation; otherwise it
7393// is expanded. FSHL is converted to FSHR before deciding what to do with it.
7394static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7395 SDValue Shifts = Op.getOperand(2);
7396 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7397 // If opcode is FSHL, convert it to FSHR
7398 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7399 SDLoc DL(Op);
7400 MVT VT = Op.getSimpleValueType();
7401 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7402
7403 if (Op.getOpcode() == ISD::FSHL) {
7404 if (NewShiftNo == 0)
7405 return Op.getOperand(0);
7406
7407 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7408 return DAG.getNode(
7409 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7410 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7411 }
7412
7413 if (Op.getOpcode() == ISD::FSHR) {
7414 if (NewShiftNo == 0)
7415 return Op.getOperand(1);
7416
7417 if (ShiftNo->getZExtValue() == NewShiftNo)
7418 return Op;
7419
7420 // Rewrite using the normalised shift amount.
7421 return DAG.getNode(
7422 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7423 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7424 }
7425 }
7426
7427 return SDValue();
7428}
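// A standalone check of the FSHL -> FSHR rewrite above (not part of this
// file; helper names are ad hoc): for bit width W and a constant shift s with
// 0 < s < W, fshl(a, b, s) == fshr(a, b, W - s).
#include <cassert>
#include <cstdint>

static uint32_t fshl32(uint32_t A, uint32_t B, unsigned S) {
  return (A << S) | (B >> (32 - S)); // valid for 0 < S < 32
}
static uint32_t fshr32(uint32_t A, uint32_t B, unsigned S) {
  return (A << (32 - S)) | (B >> S); // valid for 0 < S < 32
}

int main() {
  const uint32_t A = 0xdeadbeefu, B = 0x12345678u;
  for (unsigned S = 1; S < 32; ++S)
    assert(fshl32(A, B, S) == fshr32(A, B, 32 - S));
}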
7429
7430SDValue AArch64TargetLowering::LowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7431 SDValue X = Op.getOperand(0);
7432 EVT XScalarTy = X.getValueType();
7433 SDValue Exp = Op.getOperand(1);
7434
7435 SDLoc DL(Op);
7436 EVT XVT, ExpVT;
7437 switch (Op.getSimpleValueType().SimpleTy) {
7438 default:
7439 return SDValue();
7440 case MVT::bf16:
7441 case MVT::f16:
7442 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7443 [[fallthrough]];
7444 case MVT::f32:
7445 XVT = MVT::nxv4f32;
7446 ExpVT = MVT::nxv4i32;
7447 break;
7448 case MVT::f64:
7449 XVT = MVT::nxv2f64;
7450 ExpVT = MVT::nxv2i64;
7451 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7452 break;
7453 }
7454
7455 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7456 SDValue VX =
7457 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7458 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7459 DAG.getUNDEF(ExpVT), Exp, Zero);
7460 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7461 AArch64SVEPredPattern::all);
7462 SDValue FScale =
7463 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7464 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
7465 VPg, VX, VExp);
7466 SDValue Final =
7467 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7468 if (X.getValueType() != XScalarTy)
7469 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7470 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7471 return Final;
7472}
7473
7474SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7475 SelectionDAG &DAG) const {
7476 return Op.getOperand(0);
7477}
7478
7479SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7480 SelectionDAG &DAG) const {
7481 SDValue Chain = Op.getOperand(0);
7482 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7483 SDValue FPtr = Op.getOperand(2); // nested function
7484 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7485
7486 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7487
7488 // ldr NestReg, .+16
7489 // ldr x17, .+20
7490 // br x17
7491 // .word 0
7492 // .nest: .qword nest
7493 // .fptr: .qword fptr
7494 SDValue OutChains[5];
7495
7496 const Function *Func =
7497 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7498 CallingConv::ID CC = Func->getCallingConv();
7499 unsigned NestReg;
7500
7501 switch (CC) {
7502 default:
7503 NestReg = 0x0f; // X15
7504 break;
7506 // Must be kept in sync with AArch64CallingConv.td
7507 NestReg = 0x04; // X4
7508 break;
7509 }
7510
7511 const char FptrReg = 0x11; // X17
7512
7513 SDValue Addr = Trmp;
7514
7515 SDLoc DL(Op);
7516 OutChains[0] = DAG.getStore(
7517 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7518 MachinePointerInfo(TrmpAddr));
7519
7520 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7521 DAG.getConstant(4, DL, MVT::i64));
7522 OutChains[1] = DAG.getStore(
7523 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7524 MachinePointerInfo(TrmpAddr, 4));
7525
7526 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7527 DAG.getConstant(8, DL, MVT::i64));
7528 OutChains[2] =
7529 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7530 MachinePointerInfo(TrmpAddr, 8));
7531
7532 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7533 DAG.getConstant(16, DL, MVT::i64));
7534 OutChains[3] =
7535 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7536
7537 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7538 DAG.getConstant(24, DL, MVT::i64));
7539 OutChains[4] =
7540 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7541
7542 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7543
7544 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7545 DAG.getConstant(12, DL, MVT::i64));
7546
7547 // Call clear cache on the trampoline instructions.
7548 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7549 EndOfTrmp);
7550}
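// A standalone decode of the instruction words stored above (not part of this
// file; helper names are ad hoc). 0x58xxxxxx is the A64 "LDR Xt, label" (load
// literal) encoding: Rt lives in bits [4:0] and the word offset in the imm19
// field, so ORing in the register number and using literal offsets of 16 and
// 20 bytes reaches the nest and function-pointer slots written at trampoline
// offsets 16 and 24 (the second LDR sits 4 bytes into the trampoline).
#include <cassert>
#include <cstdint>

static unsigned ldrLiteralRt(uint32_t Insn) { return Insn & 0x1f; }
static int64_t ldrLiteralOffsetBytes(uint32_t Insn) {
  // imm19 scaled by 4; sign ignored here since these offsets are small and
  // positive.
  return static_cast<int64_t>((Insn >> 5) & 0x7ffff) * 4;
}

int main() {
  const uint32_t LdrNest = 0x58000080u | 0x0f; // ldr x15, .+16
  const uint32_t LdrFptr = 0x580000b0u | 0x11; // ldr x17, .+20
  assert(ldrLiteralRt(LdrNest) == 15 && ldrLiteralOffsetBytes(LdrNest) == 16);
  assert(ldrLiteralRt(LdrFptr) == 17 && ldrLiteralOffsetBytes(LdrFptr) == 20);
}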
7551
7552SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7553 SelectionDAG &DAG) const {
7554 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7555 LLVM_DEBUG(Op.dump());
7556
7557 switch (Op.getOpcode()) {
7558 default:
7559 llvm_unreachable("unimplemented operand");
7560 return SDValue();
7561 case ISD::LOOP_DEPENDENCE_WAR_MASK:
7562 case ISD::LOOP_DEPENDENCE_RAW_MASK:
7563 return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
7564 case ISD::BITCAST:
7565 return LowerBITCAST(Op, DAG);
7566 case ISD::GlobalAddress:
7567 return LowerGlobalAddress(Op, DAG);
7568 case ISD::GlobalTLSAddress:
7569 return LowerGlobalTLSAddress(Op, DAG);
7570 case ISD::PtrAuthGlobalAddress:
7571 return LowerPtrAuthGlobalAddress(Op, DAG);
7572 case ISD::ADJUST_TRAMPOLINE:
7573 return LowerADJUST_TRAMPOLINE(Op, DAG);
7574 case ISD::INIT_TRAMPOLINE:
7575 return LowerINIT_TRAMPOLINE(Op, DAG);
7576 case ISD::SETCC:
7577 case ISD::STRICT_FSETCC:
7578 case ISD::STRICT_FSETCCS:
7579 return LowerSETCC(Op, DAG);
7580 case ISD::SETCCCARRY:
7581 return LowerSETCCCARRY(Op, DAG);
7582 case ISD::BRCOND:
7583 return LowerBRCOND(Op, DAG);
7584 case ISD::BR_CC:
7585 return LowerBR_CC(Op, DAG);
7586 case ISD::SELECT:
7587 return LowerSELECT(Op, DAG);
7588 case ISD::SELECT_CC:
7589 return LowerSELECT_CC(Op, DAG);
7590 case ISD::JumpTable:
7591 return LowerJumpTable(Op, DAG);
7592 case ISD::BR_JT:
7593 return LowerBR_JT(Op, DAG);
7594 case ISD::BRIND:
7595 return LowerBRIND(Op, DAG);
7596 case ISD::ConstantPool:
7597 return LowerConstantPool(Op, DAG);
7598 case ISD::BlockAddress:
7599 return LowerBlockAddress(Op, DAG);
7600 case ISD::VASTART:
7601 return LowerVASTART(Op, DAG);
7602 case ISD::VACOPY:
7603 return LowerVACOPY(Op, DAG);
7604 case ISD::VAARG:
7605 return LowerVAARG(Op, DAG);
7606 case ISD::UADDO_CARRY:
7607 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7608 case ISD::USUBO_CARRY:
7609 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7610 case ISD::SADDO_CARRY:
7611 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7612 case ISD::SSUBO_CARRY:
7613 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7614 case ISD::SADDO:
7615 case ISD::UADDO:
7616 case ISD::SSUBO:
7617 case ISD::USUBO:
7618 case ISD::SMULO:
7619 case ISD::UMULO:
7620 return LowerXALUO(Op, DAG);
7621 case ISD::FADD:
7622 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7623 case ISD::FSUB:
7624 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7625 case ISD::FMUL:
7626 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7627 case ISD::FMA:
7628 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7629 case ISD::FDIV:
7630 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7631 case ISD::FNEG:
7632 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7633 case ISD::FCEIL:
7634 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7635 case ISD::FFLOOR:
7636 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7637 case ISD::FNEARBYINT:
7638 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7639 case ISD::FRINT:
7640 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7641 case ISD::FROUND:
7642 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7643 case ISD::FROUNDEVEN:
7644 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7645 case ISD::FTRUNC:
7646 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7647 case ISD::FSQRT:
7648 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7649 case ISD::FABS:
7650 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7651 case ISD::FP_ROUND:
7652 case ISD::STRICT_FP_ROUND:
7653 return LowerFP_ROUND(Op, DAG);
7654 case ISD::FP_EXTEND:
7655 case ISD::STRICT_FP_EXTEND:
7656 return LowerFP_EXTEND(Op, DAG);
7657 case ISD::FRAMEADDR:
7658 return LowerFRAMEADDR(Op, DAG);
7659 case ISD::SPONENTRY:
7660 return LowerSPONENTRY(Op, DAG);
7661 case ISD::RETURNADDR:
7662 return LowerRETURNADDR(Op, DAG);
7663 case ISD::ADDROFRETURNADDR:
7664 return LowerADDROFRETURNADDR(Op, DAG);
7665 case ISD::CONCAT_VECTORS:
7666 return LowerCONCAT_VECTORS(Op, DAG);
7667 case ISD::INSERT_VECTOR_ELT:
7668 return LowerINSERT_VECTOR_ELT(Op, DAG);
7669 case ISD::EXTRACT_VECTOR_ELT:
7670 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7671 case ISD::BUILD_VECTOR:
7672 return LowerBUILD_VECTOR(Op, DAG);
7673 case ISD::ZERO_EXTEND_VECTOR_INREG:
7674 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7675 case ISD::VECTOR_SHUFFLE:
7676 return LowerVECTOR_SHUFFLE(Op, DAG);
7677 case ISD::SPLAT_VECTOR:
7678 return LowerSPLAT_VECTOR(Op, DAG);
7679 case ISD::EXTRACT_SUBVECTOR:
7680 return LowerEXTRACT_SUBVECTOR(Op, DAG);
7681 case ISD::INSERT_SUBVECTOR:
7682 return LowerINSERT_SUBVECTOR(Op, DAG);
7683 case ISD::SDIV:
7684 case ISD::UDIV:
7685 return LowerDIV(Op, DAG);
7686 case ISD::SMIN:
7687 case ISD::UMIN:
7688 case ISD::SMAX:
7689 case ISD::UMAX:
7690 return LowerMinMax(Op, DAG);
7691 case ISD::SRA:
7692 case ISD::SRL:
7693 case ISD::SHL:
7694 return LowerVectorSRA_SRL_SHL(Op, DAG);
7695 case ISD::SHL_PARTS:
7696 case ISD::SRL_PARTS:
7697 case ISD::SRA_PARTS:
7698 return LowerShiftParts(Op, DAG);
7699 case ISD::CTPOP:
7700 case ISD::PARITY:
7701 return LowerCTPOP_PARITY(Op, DAG);
7702 case ISD::FCOPYSIGN:
7703 return LowerFCOPYSIGN(Op, DAG);
7704 case ISD::OR:
7705 return LowerVectorOR(Op, DAG);
7706 case ISD::XOR:
7707 return LowerXOR(Op, DAG);
7708 case ISD::PREFETCH:
7709 return LowerPREFETCH(Op, DAG);
7710 case ISD::SINT_TO_FP:
7711 case ISD::UINT_TO_FP:
7712 case ISD::STRICT_SINT_TO_FP:
7713 case ISD::STRICT_UINT_TO_FP:
7714 return LowerINT_TO_FP(Op, DAG);
7715 case ISD::FP_TO_SINT:
7716 case ISD::FP_TO_UINT:
7717 case ISD::STRICT_FP_TO_SINT:
7718 case ISD::STRICT_FP_TO_UINT:
7719 return LowerFP_TO_INT(Op, DAG);
7720 case ISD::FP_TO_SINT_SAT:
7721 case ISD::FP_TO_UINT_SAT:
7722 return LowerFP_TO_INT_SAT(Op, DAG);
7723 case ISD::FSINCOS:
7724 return LowerFSINCOS(Op, DAG);
7725 case ISD::GET_ROUNDING:
7726 return LowerGET_ROUNDING(Op, DAG);
7727 case ISD::SET_ROUNDING:
7728 return LowerSET_ROUNDING(Op, DAG);
7729 case ISD::GET_FPMODE:
7730 return LowerGET_FPMODE(Op, DAG);
7731 case ISD::SET_FPMODE:
7732 return LowerSET_FPMODE(Op, DAG);
7733 case ISD::RESET_FPMODE:
7734 return LowerRESET_FPMODE(Op, DAG);
7735 case ISD::MUL:
7736 return LowerMUL(Op, DAG);
7737 case ISD::MULHS:
7738 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7739 case ISD::MULHU:
7740 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7741 case ISD::INTRINSIC_W_CHAIN:
7742 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7743 case ISD::INTRINSIC_WO_CHAIN:
7744 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7745 case ISD::INTRINSIC_VOID:
7746 return LowerINTRINSIC_VOID(Op, DAG);
7747 case ISD::ATOMIC_STORE:
7748 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7749 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7750 return LowerStore128(Op, DAG);
7751 }
7752 return SDValue();
7753 case ISD::STORE:
7754 return LowerSTORE(Op, DAG);
7755 case ISD::MSTORE:
7756 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7757 case ISD::MGATHER:
7758 return LowerMGATHER(Op, DAG);
7759 case ISD::MSCATTER:
7760 return LowerMSCATTER(Op, DAG);
7761 case ISD::VECREDUCE_SEQ_FADD:
7762 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7763 case ISD::VECREDUCE_ADD:
7764 case ISD::VECREDUCE_AND:
7765 case ISD::VECREDUCE_OR:
7766 case ISD::VECREDUCE_XOR:
7767 case ISD::VECREDUCE_SMAX:
7768 case ISD::VECREDUCE_SMIN:
7769 case ISD::VECREDUCE_UMAX:
7770 case ISD::VECREDUCE_UMIN:
7771 case ISD::VECREDUCE_FADD:
7772 case ISD::VECREDUCE_FMAX:
7773 case ISD::VECREDUCE_FMIN:
7774 case ISD::VECREDUCE_FMAXIMUM:
7775 case ISD::VECREDUCE_FMINIMUM:
7776 return LowerVECREDUCE(Op, DAG);
7777 case ISD::ATOMIC_LOAD_AND:
7778 return LowerATOMIC_LOAD_AND(Op, DAG);
7779 case ISD::DYNAMIC_STACKALLOC:
7780 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7781 case ISD::VSCALE:
7782 return LowerVSCALE(Op, DAG);
7783 case ISD::VECTOR_COMPRESS:
7784 return LowerVECTOR_COMPRESS(Op, DAG);
7785 case ISD::ANY_EXTEND:
7786 case ISD::SIGN_EXTEND:
7787 case ISD::ZERO_EXTEND:
7788 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7789 case ISD::ADDRSPACECAST:
7790 return LowerADDRSPACECAST(Op, DAG);
7791 case ISD::SIGN_EXTEND_INREG: {
7792 // Only custom lower when ExtraVT has a legal byte-based element type.
7793 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7794 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7795 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7796 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7797 return SDValue();
7798
7799 return LowerToPredicatedOp(Op, DAG,
7800 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7801 }
7802 case ISD::TRUNCATE:
7803 return LowerTRUNCATE(Op, DAG);
7804 case ISD::MLOAD:
7805 return LowerMLOAD(Op, DAG);
7806 case ISD::LOAD:
7807 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7808 !Subtarget->isNeonAvailable()))
7809 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7810 return LowerLOAD(Op, DAG);
7811 case ISD::ADD:
7812 case ISD::AND:
7813 case ISD::SUB:
7814 return LowerToScalableOp(Op, DAG);
7815 case ISD::FMAXIMUM:
7816 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7817 case ISD::FMAXNUM:
7818 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7819 case ISD::FMINIMUM:
7820 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7821 case ISD::FMINNUM:
7822 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7823 case ISD::VSELECT:
7824 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7825 case ISD::ABS:
7826 return LowerABS(Op, DAG);
7827 case ISD::ABDS:
7828 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7829 case ISD::ABDU:
7830 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7831 case ISD::AVGFLOORS:
7832 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7833 case ISD::AVGFLOORU:
7834 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7835 case ISD::AVGCEILS:
7836 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7837 case ISD::AVGCEILU:
7838 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7839 case ISD::BITREVERSE:
7840 return LowerBitreverse(Op, DAG);
7841 case ISD::BSWAP:
7842 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7843 case ISD::CTLZ:
7844 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7845 case ISD::CTTZ:
7846 return LowerCTTZ(Op, DAG);
7847 case ISD::VECTOR_SPLICE:
7848 return LowerVECTOR_SPLICE(Op, DAG);
7849 case ISD::VECTOR_DEINTERLEAVE:
7850 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7851 case ISD::VECTOR_INTERLEAVE:
7852 return LowerVECTOR_INTERLEAVE(Op, DAG);
7853 case ISD::GET_ACTIVE_LANE_MASK:
7854 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
7855 case ISD::LRINT:
7856 case ISD::LLRINT:
7857 if (Op.getValueType().isVector())
7858 return LowerVectorXRINT(Op, DAG);
7859 [[fallthrough]];
7860 case ISD::LROUND:
7861 case ISD::LLROUND: {
7862 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7863 Op.getOperand(0).getValueType() == MVT::bf16) &&
7864 "Expected custom lowering of rounding operations only for f16");
7865 SDLoc DL(Op);
7866 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7867 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7868 }
7869 case ISD::STRICT_LROUND:
7870 case ISD::STRICT_LLROUND:
7871 case ISD::STRICT_LRINT:
7872 case ISD::STRICT_LLRINT: {
7873 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7874 Op.getOperand(1).getValueType() == MVT::bf16) &&
7875 "Expected custom lowering of rounding operations only for f16");
7876 SDLoc DL(Op);
7877 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7878 {Op.getOperand(0), Op.getOperand(1)});
7879 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7880 {Ext.getValue(1), Ext.getValue(0)});
7881 }
7882 case ISD::WRITE_REGISTER: {
7883 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7884 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7885 SDLoc DL(Op);
7886
7887 SDValue Chain = Op.getOperand(0);
7888 SDValue SysRegName = Op.getOperand(1);
7889 std::pair<SDValue, SDValue> Pair =
7890 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7891
7892 // chain = MSRR(chain, sysregname, lo, hi)
7893 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7894 SysRegName, Pair.first, Pair.second);
7895
7896 return Result;
7897 }
7898 case ISD::FSHL:
7899 case ISD::FSHR:
7900 return LowerFunnelShift(Op, DAG);
7901 case ISD::FLDEXP:
7902 return LowerFLDEXP(Op, DAG);
7903 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7904 return LowerVECTOR_HISTOGRAM(Op, DAG);
7905 case ISD::PARTIAL_REDUCE_SMLA:
7906 case ISD::PARTIAL_REDUCE_UMLA:
7907 case ISD::PARTIAL_REDUCE_SUMLA:
7908 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
7909 }
7910}
7911
7912bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7913 return !Subtarget->useSVEForFixedLengthVectors();
7914}
7915
7916bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7917 EVT VT, bool OverrideNEON) const {
7918 if (!VT.isFixedLengthVector() || !VT.isSimple())
7919 return false;
7920
7921 // Don't use SVE for vectors we cannot scalarize if required.
7922 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7923 // Fixed length predicates should be promoted to i8.
7924 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7925 case MVT::i1:
7926 default:
7927 return false;
7928 case MVT::i8:
7929 case MVT::i16:
7930 case MVT::i32:
7931 case MVT::i64:
7932 case MVT::f16:
7933 case MVT::f32:
7934 case MVT::f64:
7935 break;
7936 }
7937
7938 // NEON-sized vectors can be emulated using SVE instructions.
7939 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7940 return Subtarget->isSVEorStreamingSVEAvailable();
7941
7942 // Ensure NEON MVTs only belong to a single register class.
7943 if (VT.getFixedSizeInBits() <= 128)
7944 return false;
7945
7946 // Ensure wider than NEON code generation is enabled.
7947 if (!Subtarget->useSVEForFixedLengthVectors())
7948 return false;
7949
7950 // Don't use SVE for types that don't fit.
7951 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7952 return false;
7953
7954 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7955 // the base fixed length SVE support in place.
7956 if (!VT.isPow2VectorType())
7957 return false;
7958
7959 return true;
7960}
7961
7962//===----------------------------------------------------------------------===//
7963// Calling Convention Implementation
7964//===----------------------------------------------------------------------===//
7965
7966static unsigned getIntrinsicID(const SDNode *N) {
7967 unsigned Opcode = N->getOpcode();
7968 switch (Opcode) {
7969 default:
7970 return Intrinsic::not_intrinsic;
7971 case ISD::INTRINSIC_WO_CHAIN: {
7972 unsigned IID = N->getConstantOperandVal(0);
7973 if (IID < Intrinsic::num_intrinsics)
7974 return IID;
7975 return Intrinsic::not_intrinsic;
7976 }
7977 }
7978}
7979
7980bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7981 SDValue N1) const {
7982 if (!N0.hasOneUse())
7983 return false;
7984
7985 unsigned IID = getIntrinsicID(N1.getNode());
7986 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7987 if (IID == Intrinsic::aarch64_neon_umull ||
7988 N1.getOpcode() == AArch64ISD::UMULL ||
7989 IID == Intrinsic::aarch64_neon_smull ||
7990 N1.getOpcode() == AArch64ISD::SMULL)
7991 return N0.getOpcode() != ISD::ADD;
7992
7993 return true;
7994}
7995
7996/// Selects the correct CCAssignFn for a given CallingConvention value.
7997CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7998 bool IsVarArg) const {
7999 switch (CC) {
8000 default:
8001 reportFatalUsageError("unsupported calling convention");
8002 case CallingConv::GHC:
8003 return CC_AArch64_GHC;
8004 case CallingConv::PreserveNone:
8005 // The VarArg implementation makes assumptions about register
8006 // argument passing that do not hold for preserve_none, so we
8007 // instead fall back to C argument passing.
8008 // The non-vararg case is handled in the CC function itself.
8009 if (!IsVarArg)
8010 return CC_AArch64_Preserve_None;
8011 [[fallthrough]];
8012 case CallingConv::C:
8013 case CallingConv::Fast:
8014 case CallingConv::PreserveMost:
8015 case CallingConv::PreserveAll:
8016 case CallingConv::CXX_FAST_TLS:
8017 case CallingConv::Swift:
8018 case CallingConv::SwiftTail:
8019 case CallingConv::Tail:
8020 case CallingConv::GRAAL:
8021 if (Subtarget->isTargetWindows()) {
8022 if (IsVarArg) {
8023 if (Subtarget->isWindowsArm64EC())
8024 return CC_AArch64_Arm64EC_VarArg;
8025 return CC_AArch64_Win64_VarArg;
8026 }
8027 return CC_AArch64_Win64PCS;
8028 }
8029 if (!Subtarget->isTargetDarwin())
8030 return CC_AArch64_AAPCS;
8031 if (!IsVarArg)
8032 return CC_AArch64_DarwinPCS;
8033 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8035 case CallingConv::Win64:
8036 if (IsVarArg) {
8037 if (Subtarget->isWindowsArm64EC())
8038 return CC_AArch64_Arm64EC_VarArg;
8039 return CC_AArch64_Win64_VarArg;
8040 }
8041 return CC_AArch64_Win64PCS;
8042 case CallingConv::CFGuard_Check:
8043 if (Subtarget->isWindowsArm64EC())
8044 return CC_AArch64_Arm64EC_CFGuard_Check;
8045 return CC_AArch64_Win64_CFGuard_Check;
8046 case CallingConv::AArch64_VectorCall:
8047 case CallingConv::AArch64_SVE_VectorCall:
8048 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
8049 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
8050 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
8051 return CC_AArch64_AAPCS;
8052 case CallingConv::ARM64EC_Thunk_X64:
8053 return CC_AArch64_Arm64EC_Thunk;
8054 case CallingConv::ARM64EC_Thunk_Native:
8055 return CC_AArch64_Arm64EC_Thunk_Native;
8056 }
8057}
8058
8059CCAssignFn *
8060AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
8061 switch (CC) {
8062 default:
8063 return RetCC_AArch64_AAPCS;
8064 case CallingConv::ARM64EC_Thunk_X64:
8065 return RetCC_AArch64_Arm64EC_Thunk;
8066 case CallingConv::CFGuard_Check:
8067 if (Subtarget->isWindowsArm64EC())
8068 return RetCC_AArch64_Arm64EC_CFGuard_Check;
8069 return RetCC_AArch64_AAPCS;
8070 }
8071}
8072
8073static bool isPassedInFPR(EVT VT) {
8074 return VT.isFixedLengthVector() ||
8075 (VT.isFloatingPoint() && !VT.isScalableVector());
8076}
8077
8078static SDValue getZT0FrameIndex(MachineFrameInfo &MFI,
8079 AArch64FunctionInfo &FuncInfo,
8080 SelectionDAG &DAG) {
8081 if (!FuncInfo.hasZT0SpillSlotIndex())
8082 FuncInfo.setZT0SpillSlotIndex(MFI.CreateSpillStackObject(64, Align(16)));
8083
8084 return DAG.getFrameIndex(
8085 FuncInfo.getZT0SpillSlotIndex(),
8086 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8087}
8088
8089SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8090 SelectionDAG &DAG) const {
8091 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8092 SDValue Glue = Chain.getValue(1);
8093
8094 MachineFunction &MF = DAG.getMachineFunction();
8095 SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs();
8096
8097 // The following conditions are true on entry to an exception handler:
8098 // - PSTATE.SM is 0.
8099 // - PSTATE.ZA is 0.
8100 // - TPIDR2_EL0 is null.
8101 // See:
8102 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8103 //
8104 // Therefore, if the function that contains this exception handler is a
8105 // streaming[-compatible] function, we must re-enable streaming mode.
8106 //
8107 // These mode changes are usually optimized away in catch blocks as they
8108 // occur before the __cxa_begin_catch (which is a non-streaming function),
8109 // but are necessary in some cases (such as for cleanups).
8110
8111 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8112 return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8113 /*Glue*/ Glue, AArch64SME::Always);
8114
8115 if (SMEFnAttrs.hasStreamingCompatibleInterface())
8116 return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
8117 AArch64SME::IfCallerIsStreaming);
8118
8119 return Chain;
8120}
8121
8122SDValue AArch64TargetLowering::LowerFormalArguments(
8123 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8124 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8125 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8126 MachineFunction &MF = DAG.getMachineFunction();
8127 const Function &F = MF.getFunction();
8128 MachineFrameInfo &MFI = MF.getFrameInfo();
8129 bool IsWin64 =
8130 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8131 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8132 (isVarArg && Subtarget->isWindowsArm64EC());
8133 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8134
8135 SmallVector<ISD::OutputArg, 4> Outs;
8136 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8137 DAG.getTargetLoweringInfo(), DAG.getDataLayout());
8138 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8139 FuncInfo->setIsSVECC(true);
8140
8141 // Assign locations to all of the incoming arguments.
8142 SmallVector<CCValAssign, 16> ArgLocs;
8143 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8144
8145 // At this point, Ins[].VT may already be promoted to i32. To correctly
8146 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8147 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8148 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8149 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8150 // LocVT.
8151 unsigned NumArgs = Ins.size();
8152 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8153 unsigned CurArgIdx = 0;
8154 bool UseVarArgCC = false;
8155 if (IsWin64)
8156 UseVarArgCC = isVarArg;
8157
8158 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8159
8160 for (unsigned i = 0; i != NumArgs; ++i) {
8161 MVT ValVT = Ins[i].VT;
8162 if (Ins[i].isOrigArg()) {
8163 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8164 CurArgIdx = Ins[i].getOrigArgIndex();
8165
8166 // Get type of the original argument.
8167 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8168 /*AllowUnknown*/ true);
8169 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8170 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8171 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8172 ValVT = MVT::i8;
8173 else if (ActualMVT == MVT::i16)
8174 ValVT = MVT::i16;
8175 }
8176 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8177 Ins[i].OrigTy, CCInfo);
8178 assert(!Res && "Call operand has unhandled type");
8179 (void)Res;
8180 }
8181
8182 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8183 bool IsLocallyStreaming =
8184 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8185 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8186 SDValue Glue = Chain.getValue(1);
8187
8188 unsigned ExtraArgLocs = 0;
8189 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8190 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8191
8192 if (Ins[i].Flags.isByVal()) {
8193 // Byval is used for HFAs in the PCS, but the system should work in a
8194 // non-compliant manner for larger structs.
8195 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8196 int Size = Ins[i].Flags.getByValSize();
8197 unsigned NumRegs = (Size + 7) / 8;
8198
8199 // FIXME: This works on big-endian for composite byvals, which are the common
8200 // case. It should work for fundamental types too.
8201 unsigned FrameIdx =
8202 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8203 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8204 InVals.push_back(FrameIdxN);
8205
8206 continue;
8207 }
8208
8209 if (Ins[i].Flags.isSwiftAsync())
8210 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8211
8212 SDValue ArgValue;
8213 if (VA.isRegLoc()) {
8214 // Arguments stored in registers.
8215 EVT RegVT = VA.getLocVT();
8216 const TargetRegisterClass *RC;
8217
8218 if (RegVT == MVT::i32)
8219 RC = &AArch64::GPR32RegClass;
8220 else if (RegVT == MVT::i64)
8221 RC = &AArch64::GPR64RegClass;
8222 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8223 RC = &AArch64::FPR16RegClass;
8224 else if (RegVT == MVT::f32)
8225 RC = &AArch64::FPR32RegClass;
8226 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8227 RC = &AArch64::FPR64RegClass;
8228 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8229 RC = &AArch64::FPR128RegClass;
8230 else if (RegVT.isScalableVector() &&
8231 RegVT.getVectorElementType() == MVT::i1) {
8232 FuncInfo->setIsSVECC(true);
8233 RC = &AArch64::PPRRegClass;
8234 } else if (RegVT == MVT::aarch64svcount) {
8235 FuncInfo->setIsSVECC(true);
8236 RC = &AArch64::PPRRegClass;
8237 } else if (RegVT.isScalableVector()) {
8238 FuncInfo->setIsSVECC(true);
8239 RC = &AArch64::ZPRRegClass;
8240 } else
8241 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8242
8243 // Transform the arguments in physical registers into virtual ones.
8244 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8245
8246 if (IsLocallyStreaming) {
8247 // LocallyStreamingFunctions must insert the SMSTART in the correct
8248 // position, so we use Glue to ensure no instructions can be scheduled
8249 // between the chain of:
8250 // t0: ch,glue = EntryNode
8251 // t1: res,ch,glue = CopyFromReg
8252 // ...
8253 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8254 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8255 // ^^^^^^
8256 // This will be the new Chain/Root node.
8257 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8258 Glue = ArgValue.getValue(2);
8259 if (isPassedInFPR(ArgValue.getValueType())) {
8260 ArgValue =
8261 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8262 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8263 {ArgValue, Glue});
8264 Glue = ArgValue.getValue(1);
8265 }
8266 } else
8267 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8268
8269 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8270 // to 64 bits. Insert an assert[sz]ext to capture this, then
8271 // truncate to the right size.
8272 switch (VA.getLocInfo()) {
8273 default:
8274 llvm_unreachable("Unknown loc info!");
8275 case CCValAssign::Full:
8276 break;
8277 case CCValAssign::Indirect:
8278 assert(
8279 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8280 "Indirect arguments should be scalable on most subtargets");
8281 break;
8282 case CCValAssign::BCvt:
8283 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8284 break;
8285 case CCValAssign::AExt:
8286 case CCValAssign::SExt:
8287 case CCValAssign::ZExt:
8288 break;
8289 case CCValAssign::AExtUpper:
8290 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8291 DAG.getConstant(32, DL, RegVT));
8292 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8293 break;
8294 }
8295 } else { // VA.isRegLoc()
8296 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8297 unsigned ArgOffset = VA.getLocMemOffset();
8298 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8299 ? VA.getLocVT().getSizeInBits()
8300 : VA.getValVT().getSizeInBits()) / 8;
8301
8302 uint32_t BEAlign = 0;
8303 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8304 !Ins[i].Flags.isInConsecutiveRegs())
8305 BEAlign = 8 - ArgSize;
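 // E.g. a 4-byte argument on big-endian sits in the high half of its 8-byte
 // slot, giving BEAlign = 8 - 4 = 4.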
8306
8307 SDValue FIN;
8308 MachinePointerInfo PtrInfo;
8309 if (StackViaX4) {
8310 // In both the ARM64EC varargs convention and the thunk convention,
8311 // arguments on the stack are accessed relative to x4, not sp. In
8312 // the thunk convention, there's an additional offset of 32 bytes
8313 // to account for the shadow store.
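 // E.g. under the ARM64EC thunk convention an argument at stack offset 8 is
 // read from x4 + 8 + 32, the extra 32 bytes being the x64 shadow store.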
8314 unsigned ObjOffset = ArgOffset + BEAlign;
8315 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8316 ObjOffset += 32;
8317 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8318 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8319 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8320 DAG.getConstant(ObjOffset, DL, MVT::i64));
8321 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
8322 } else {
8323 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8324
8325 // Create load nodes to retrieve arguments from the stack.
8326 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8327 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8328 }
8329
8330 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
8331 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8332 MVT MemVT = VA.getValVT();
8333
8334 switch (VA.getLocInfo()) {
8335 default:
8336 break;
8337 case CCValAssign::Trunc:
8338 case CCValAssign::BCvt:
8339 MemVT = VA.getLocVT();
8340 break;
8341 case CCValAssign::Indirect:
8342 assert((VA.getValVT().isScalableVT() ||
8343 Subtarget->isWindowsArm64EC()) &&
8344 "Indirect arguments should be scalable on most subtargets");
8345 MemVT = VA.getLocVT();
8346 break;
8347 case CCValAssign::SExt:
8348 ExtType = ISD::SEXTLOAD;
8349 break;
8350 case CCValAssign::ZExt:
8351 ExtType = ISD::ZEXTLOAD;
8352 break;
8353 case CCValAssign::AExt:
8354 ExtType = ISD::EXTLOAD;
8355 break;
8356 }
8357
8358 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8359 MemVT);
8360 }
8361
8362 if (VA.getLocInfo() == CCValAssign::Indirect) {
8363 assert((VA.getValVT().isScalableVT() ||
8364 Subtarget->isWindowsArm64EC()) &&
8365 "Indirect arguments should be scalable on most subtargets");
8366
8367 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8368 unsigned NumParts = 1;
8369 if (Ins[i].Flags.isInConsecutiveRegs()) {
8370 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8371 ++NumParts;
8372 }
8373
8374 MVT PartLoad = VA.getValVT();
8375 SDValue Ptr = ArgValue;
8376
8377 // Ensure we generate all loads for each tuple part, whilst updating the
8378 // pointer after each load correctly using vscale.
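 // E.g. an SVE tuple argument arriving as two consecutive parts is loaded
 // piecewise, with the pointer advanced by vscale * (the part's
 // known-minimum store size) between the loads.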
8379 while (NumParts > 0) {
8380 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8381 InVals.push_back(ArgValue);
8382 NumParts--;
8383 if (NumParts > 0) {
8384 SDValue BytesIncrement;
8385 if (PartLoad.isScalableVector()) {
8386 BytesIncrement = DAG.getVScale(
8387 DL, Ptr.getValueType(),
8388 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8389 } else {
8390 BytesIncrement = DAG.getConstant(
8391 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8392 Ptr.getValueType());
8393 }
8394 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8395 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8396 ExtraArgLocs++;
8397 i++;
8398 }
8399 }
8400 } else {
8401 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8402 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8403 ArgValue, DAG.getValueType(MVT::i32));
8404
8405 // i1 arguments are zero-extended to i8 by the caller. Emit a
8406 // hint to reflect this.
8407 if (Ins[i].isOrigArg()) {
8408 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8409 if (OrigArg->getType()->isIntegerTy(1)) {
8410 if (!Ins[i].Flags.isZExt()) {
8411 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8412 ArgValue.getValueType(), ArgValue);
8413 }
8414 }
8415 }
8416
8417 InVals.push_back(ArgValue);
8418 }
8419 }
8420 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8421
8422 if (Attrs.hasStreamingCompatibleInterface()) {
8423 SDValue EntryPStateSM =
8424 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8425 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8426
8427 // Copy the value to a virtual register, and save that in FuncInfo.
8428 Register EntryPStateSMReg =
8429 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8430 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8431 EntryPStateSM);
8432 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8433 }
8434
8435 // Insert the SMSTART if this is a locally streaming function and
8436 // make sure it is Glued to the last CopyFromReg value.
8437 if (IsLocallyStreaming) {
8438 if (Attrs.hasStreamingCompatibleInterface())
8439 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8440 AArch64SME::IfCallerIsNonStreaming);
8441 else
8442 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8443 AArch64SME::Always);
8444
8445 // Ensure that the SMSTART happens after the CopyWithChain such that its
8446 // chain result is used.
8447 for (unsigned I=0; I<InVals.size(); ++I) {
8448 Register Reg = MF.getRegInfo().createVirtualRegister(
8449 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8450 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8451 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8452 InVals[I].getValueType());
8453 }
8454 }
8455
8456 // varargs
8457 if (isVarArg) {
8459 if (!Subtarget->isTargetDarwin() || IsWin64) {
8460 // The AAPCS variadic function ABI is identical to the non-variadic
8461 // one. As a result there may be more arguments in registers and we
8462 // should save them for future reference.
8463 // Win64 variadic functions also pass arguments in registers, but all
8464 // float arguments are passed in integer registers.
8465 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8466 }
8467
8468 // This will point to the next argument passed via stack.
8469 unsigned VarArgsOffset = CCInfo.getStackSize();
8470 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8471 VarArgsOffset =
8472 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8473 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8474 FuncInfo->setVarArgsStackIndex(
8475 MFI.CreateFixedObject(4, VarArgsOffset, true));
8476 }
8477
8478 if (MFI.hasMustTailInVarArgFunc()) {
8479 SmallVector<MVT, 2> RegParmTypes;
8480 RegParmTypes.push_back(MVT::i64);
8481 RegParmTypes.push_back(MVT::f128);
8482 // Compute the set of forwarded registers. The rest are scratch.
8483 SmallVectorImpl<ForwardedRegister> &Forwards =
8484 FuncInfo->getForwardedMustTailRegParms();
8485 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8486 CC_AArch64_AAPCS);
8487
8488 // Conservatively forward X8, since it might be used for aggregate return.
8489 if (!CCInfo.isAllocated(AArch64::X8)) {
8490 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8491 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8492 }
8493 }
8494 }
8495
8496 // On Windows, InReg pointers must be returned, so record the pointer in a
8497 // virtual register at the start of the function so it can be returned in the
8498 // epilogue.
8499 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8500 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8501 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8502 Ins[I].Flags.isInReg()) &&
8503 Ins[I].Flags.isSRet()) {
8504 assert(!FuncInfo->getSRetReturnReg());
8505
8506 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8507 Register Reg =
8508 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8509 FuncInfo->setSRetReturnReg(Reg);
8510
8511 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8512 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8513 break;
8514 }
8515 }
8516 }
8517
8518 unsigned StackArgSize = CCInfo.getStackSize();
8519 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8520 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8521 // This is a non-standard ABI so by fiat I say we're allowed to make full
8522 // use of the stack area to be popped, which must be aligned to 16 bytes in
8523 // any case:
8524 StackArgSize = alignTo(StackArgSize, 16);
8525
8526 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8527 // a multiple of 16.
8528 FuncInfo->setArgumentStackToRestore(StackArgSize);
8529
8530 // This realignment carries over to the available bytes below. Our own
8531 // callers will guarantee the space is free by giving an aligned value to
8532 // CALLSEQ_START.
8533 }
8534 // Even if we're not expected to free up the space, it's useful to know how
8535 // much is there while considering tail calls (because we can reuse it).
8536 FuncInfo->setBytesInStackArgArea(StackArgSize);
8537
8538 if (Subtarget->hasCustomCallingConv())
8539 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8540
8541 if (getTM().useNewSMEABILowering()) {
8542 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
8543 SDValue Size;
8544 if (Attrs.hasZAState()) {
8545 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8546 DAG.getConstant(1, DL, MVT::i32));
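 // RDSVL #1 yields the streaming vector length in bytes (SVL_B); the full ZA
 // array needs SVL_B * SVL_B bytes, computed by the multiply below.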
8547 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8548 } else if (Attrs.hasAgnosticZAInterface()) {
8549 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
8550 SDValue Callee = DAG.getExternalSymbol(
8551 getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
8552 auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
8553 TargetLowering::CallLoweringInfo CLI(DAG);
8554 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8555 getLibcallCallingConv(LC), RetTy, Callee, {});
8556 std::tie(Size, Chain) = LowerCallTo(CLI);
8557 }
8558 if (Size) {
8559 SDValue Buffer = DAG.getNode(
8560 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8561 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8562 Chain = Buffer.getValue(1);
8563
8564 Register BufferPtr =
8565 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8566 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8567 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
8568 DAG.getVTList(MVT::Other), Chain);
8569 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
8570 MFI.CreateVariableSizedObject(Align(16), nullptr);
8571 }
8572 }
8573 } else {
8574 // Old SME ABI lowering (deprecated):
8575 // Create a 16-byte TPIDR2 object. The dynamic buffer
8576 // will be expanded and stored in the static object later using a
8577 // pseudonode.
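 // Per the SME lazy-save ABI, this 16-byte block records the save-buffer
 // pointer and the number of ZA save slices; INIT_TPIDR2OBJ below fills in
 // both fields.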
8578 if (Attrs.hasZAState()) {
8579 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8580 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8581 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8582 DAG.getConstant(1, DL, MVT::i32));
8583 SDValue Buffer;
8584 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8585 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8586 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8587 } else {
8588 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8589 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8590 DAG.getVTList(MVT::i64, MVT::Other),
8591 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8592 MFI.CreateVariableSizedObject(Align(16), nullptr);
8593 }
8594 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8595 DAG.getConstant(1, DL, MVT::i32));
8596 Chain = DAG.getNode(
8597 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8598 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
8599 /*Num save slices*/ NumZaSaveSlices});
8600 } else if (Attrs.hasAgnosticZAInterface()) {
8601 // Call __arm_sme_state_size().
8602 SDValue BufferSize =
8603 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
8604 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8605 Chain = BufferSize.getValue(1);
8606 SDValue Buffer;
8607 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8608 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
8609 DAG.getVTList(MVT::i64, MVT::Other),
8610 {Chain, BufferSize});
8611 } else {
8612 // Allocate space dynamically.
8613 Buffer = DAG.getNode(
8614 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8615 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8616 MFI.CreateVariableSizedObject(Align(16), nullptr);
8617 }
8618 // Copy the value to a virtual register, and save that in FuncInfo.
8619 Register BufferPtr =
8620 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8621 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8622 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
8623 }
8624 }
8625
8626 if (CallConv == CallingConv::PreserveNone) {
8627 for (const ISD::InputArg &I : Ins) {
8628 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8629 I.Flags.isSwiftAsync()) {
8630 MachineFunction &MF = DAG.getMachineFunction();
8631 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8632 MF.getFunction(),
8633 "Swift attributes can't be used with preserve_none",
8634 DL.getDebugLoc()));
8635 break;
8636 }
8637 }
8638 }
8639
8640 if (getTM().useNewSMEABILowering()) {
8641 // Clear new ZT0 state. TODO: Move this to the SME ABI pass.
8642 if (Attrs.isNewZT0())
8643 Chain = DAG.getNode(
8644 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8645 DAG.getConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32),
8646 DAG.getTargetConstant(0, DL, MVT::i32));
8647 }
8648
8649 return Chain;
8650}
8651
8652void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8653 SelectionDAG &DAG,
8654 const SDLoc &DL,
8655 SDValue &Chain) const {
8656 MachineFunction &MF = DAG.getMachineFunction();
8657 MachineFrameInfo &MFI = MF.getFrameInfo();
8658 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8659 auto PtrVT = getPointerTy(DAG.getDataLayout());
8660 Function &F = MF.getFunction();
8661 bool IsWin64 =
8662 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8663
8664 SmallVector<SDValue, 8> MemOps;
8665
8666 auto GPRArgRegs = AArch64::getGPRArgRegs();
8667 unsigned NumGPRArgRegs = GPRArgRegs.size();
8668 if (Subtarget->isWindowsArm64EC()) {
8669 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8670 // functions.
8671 NumGPRArgRegs = 4;
8672 }
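 // With only x0-x3 forwarded, the Arm64EC GPR save area is at most 4 * 8 = 32 bytes.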
8673 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8674
8675 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8676 int GPRIdx = 0;
8677 if (GPRSaveSize != 0) {
8678 if (IsWin64) {
8679 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8680 if (GPRSaveSize & 15)
8681 // The extra size here, if triggered, will always be 8.
8682 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
8683 } else
8684 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8685
8686 SDValue FIN;
8687 if (Subtarget->isWindowsArm64EC()) {
8688 // With the Arm64EC ABI, we reserve the save area as usual, but we
8689 // compute its address relative to x4. For a normal AArch64->AArch64
8690 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8691 // different address.
8692 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8693 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8694 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8695 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8696 } else {
8697 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8698 }
8699
8700 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8701 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8702 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8703 SDValue Store =
8704 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8705 IsWin64 ? MachinePointerInfo::getFixedStack(
8706 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8707 : MachinePointerInfo::getStack(MF, i * 8));
8708 MemOps.push_back(Store);
8709 FIN =
8710 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8711 }
8712 }
8713 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8714 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8715
8716 if (Subtarget->hasFPARMv8() && !IsWin64) {
8717 auto FPRArgRegs = AArch64::getFPRArgRegs();
8718 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8719 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8720
8721 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8722 int FPRIdx = 0;
8723 if (FPRSaveSize != 0) {
8724 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8725
8726 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8727
8728 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8729 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8730 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8731
8732 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8733 MachinePointerInfo::getStack(MF, i * 16));
8734 MemOps.push_back(Store);
8735 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8736 DAG.getConstant(16, DL, PtrVT));
8737 }
8738 }
8739 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8740 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8741 }
8742
8743 if (!MemOps.empty()) {
8744 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8745 }
8746}
8747
8748/// LowerCallResult - Lower the result values of a call into the
8749/// appropriate copies out of appropriate physical registers.
8750SDValue AArch64TargetLowering::LowerCallResult(
8751 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8752 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8753 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8754 SDValue ThisVal, bool RequiresSMChange) const {
8755 DenseMap<unsigned, SDValue> CopiedRegs;
8756 // Copy all of the result registers out of their specified physreg.
8757 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8758 CCValAssign VA = RVLocs[i];
8759
8760 // Pass 'this' value directly from the argument to return value, to avoid
8761 // reg unit interference
8762 if (i == 0 && isThisReturn) {
8763 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8764 "unexpected return calling convention register assignment");
8765 InVals.push_back(ThisVal);
8766 continue;
8767 }
8768
8769 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8770 // allows one use of a physreg per block.
8771 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8772 if (!Val) {
8773 Val =
8774 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8775 Chain = Val.getValue(1);
8776 InGlue = Val.getValue(2);
8777 CopiedRegs[VA.getLocReg()] = Val;
8778 }
8779
8780 switch (VA.getLocInfo()) {
8781 default:
8782 llvm_unreachable("Unknown loc info!");
8783 case CCValAssign::Full:
8784 break;
8785 case CCValAssign::BCvt:
8786 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8787 break;
8788 case CCValAssign::AExtUpper:
8789 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8790 DAG.getConstant(32, DL, VA.getLocVT()));
8791 [[fallthrough]];
8792 case CCValAssign::AExt:
8793 [[fallthrough]];
8794 case CCValAssign::ZExt:
8795 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8796 break;
8797 }
8798
8799 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8800 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8801 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
8802
8803 InVals.push_back(Val);
8804 }
8805
8806 return Chain;
8807}
8808
8809/// Return true if the calling convention is one that we can guarantee TCO for.
8810static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8811 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8812 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8813}
8814
8815/// Return true if we might ever do TCO for calls with this calling convention.
8816static bool mayTailCallThisCC(CallingConv::ID CC) {
8817 switch (CC) {
8818 case CallingConv::C:
8819 case CallingConv::AArch64_SVE_VectorCall:
8820 case CallingConv::PreserveMost:
8821 case CallingConv::PreserveAll:
8822 case CallingConv::PreserveNone:
8823 case CallingConv::Swift:
8824 case CallingConv::SwiftTail:
8825 case CallingConv::Tail:
8826 case CallingConv::Fast:
8827 return true;
8828 default:
8829 return false;
8830 }
8831}
8832
8833/// Return true if the call convention supports varargs
8834/// Currently only those that pass varargs like the C
8835/// calling convention does are eligible
8836/// Calling conventions listed in this function must also
8837/// be properly handled in AArch64Subtarget::isCallingConvWin64
8838static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8839 switch (CC) {
8840 case CallingConv::C:
8841 case CallingConv::PreserveNone:
8842 // SVE vector call is only partially supported, but it should
8843 // support named arguments being passed. Any arguments passed
8844 // as varargs are still unsupported.
8845 case CallingConv::AArch64_SVE_VectorCall:
8846 return true;
8847 default:
8848 return false;
8849 }
8850}
8851
8853 const AArch64Subtarget *Subtarget,
8855 CCState &CCInfo) {
8856 const SelectionDAG &DAG = CLI.DAG;
8857 CallingConv::ID CalleeCC = CLI.CallConv;
8858 bool IsVarArg = CLI.IsVarArg;
8859 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8860 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8861
8862 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8863 // for the shadow store.
8864 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8865 CCInfo.AllocateStack(32, Align(16));
8866
8867 unsigned NumArgs = Outs.size();
8868 for (unsigned i = 0; i != NumArgs; ++i) {
8869 MVT ArgVT = Outs[i].VT;
8870 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8871
8872 bool UseVarArgCC = false;
8873 if (IsVarArg) {
8874 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8875 // too, so use the vararg CC to force them to integer registers.
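 // E.g. on Win64 a fixed double passed to a variadic callee goes in an X
 // register (as its bit pattern) rather than in a D register.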
8876 if (IsCalleeWin64) {
8877 UseVarArgCC = true;
8878 } else {
8879 UseVarArgCC = ArgFlags.isVarArg();
8880 }
8881 }
8882
8883 if (!UseVarArgCC) {
8884 // Get type of the original argument.
8885 EVT ActualVT =
8886 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8887 /*AllowUnknown*/ true);
8888 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8889 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8890 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8891 ArgVT = MVT::i8;
8892 else if (ActualMVT == MVT::i16)
8893 ArgVT = MVT::i16;
8894 }
8895
8896 // FIXME: CCAssignFnForCall should be called once, for the call and not per
8897 // argument. This logic should exactly mirror LowerFormalArguments.
8898 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8899 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
8900 Outs[i].OrigTy, CCInfo);
8901 assert(!Res && "Call operand has unhandled type");
8902 (void)Res;
8903 }
8904}
8905
8906static SMECallAttrs
8907getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI,
8908 const TargetLowering::CallLoweringInfo &CLI) {
8909 if (CLI.CB)
8910 return SMECallAttrs(*CLI.CB, &TLI);
8911 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8912 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI));
8913 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
8914}
8915
8916bool AArch64TargetLowering::isEligibleForTailCallOptimization(
8917 const CallLoweringInfo &CLI) const {
8918 CallingConv::ID CalleeCC = CLI.CallConv;
8919 if (!mayTailCallThisCC(CalleeCC))
8920 return false;
8921
8922 SDValue Callee = CLI.Callee;
8923 bool IsVarArg = CLI.IsVarArg;
8924 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8925 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8926 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8927 const SelectionDAG &DAG = CLI.DAG;
8928 MachineFunction &MF = DAG.getMachineFunction();
8929 const Function &CallerF = MF.getFunction();
8930 CallingConv::ID CallerCC = CallerF.getCallingConv();
8931
8932 // SME Streaming functions are not eligible for TCO as they may require
8933 // the streaming mode or ZA to be restored after returning from the call.
8934 SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI);
8935 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
8936 CallAttrs.requiresPreservingAllZAState() ||
8937 CallAttrs.caller().hasStreamingBody())
8938 return false;
8939
8940 // Functions using the C or Fast calling convention that have an SVE signature
8941 // preserve more registers and should assume the SVE_VectorCall CC.
8942 // The check for matching callee-saved regs will determine whether it is
8943 // eligible for TCO.
8944 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
8945 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
8946 CallerCC = CallingConv::AArch64_SVE_VectorCall;
8947
8948 bool CCMatch = CallerCC == CalleeCC;
8949
8950 // When using the Windows calling convention on a non-windows OS, we want
8951 // to back up and restore X18 in such functions; we can't do a tail call
8952 // from those functions.
8953 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
8954 CalleeCC != CallingConv::Win64)
8955 return false;
8956
8957 // Byval parameters hand the function a pointer directly into the stack area
8958 // we want to reuse during a tail call. Working around this *is* possible (see
8959 // X86) but less efficient and uglier in LowerCall.
8960 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8961 e = CallerF.arg_end();
8962 i != e; ++i) {
8963 if (i->hasByValAttr())
8964 return false;
8965
8966 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8967 // In this case, it is necessary to save X0/X1 in the callee and return it
8968 // in X0. Tail call opt may interfere with this, so we disable tail call
8969 // opt when the caller has an "inreg" attribute -- except if the callee
8970 // also has that attribute on the same argument, and the same value is
8971 // passed.
8972 if (i->hasInRegAttr()) {
8973 unsigned ArgIdx = i - CallerF.arg_begin();
8974 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
8975 return false;
8976 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
8977 if (!Attrs.hasAttribute(Attribute::InReg) ||
8978 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
8979 CLI.CB->getArgOperand(ArgIdx) != i) {
8980 return false;
8981 }
8982 }
8983 }
8984
8985 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
8986 return CCMatch;
8987
8988 // Externally-defined functions with weak linkage should not be
8989 // tail-called on AArch64 when the OS does not support dynamic
8990 // pre-emption of symbols, as the AAELF spec requires normal calls
8991 // to undefined weak functions to be replaced with a NOP or jump to the
8992 // next instruction. The behaviour of branch instructions in this
8993 // situation (as used for tail calls) is implementation-defined, so we
8994 // cannot rely on the linker replacing the tail call with a return.
8995 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8996 const GlobalValue *GV = G->getGlobal();
8997 const Triple &TT = getTargetMachine().getTargetTriple();
8998 if (GV->hasExternalWeakLinkage() &&
8999 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
9000 return false;
9001 }
9002
9003 // Now we search for cases where we can use a tail call without changing the
9004 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
9005 // concept.
9006
9007 // I want anyone implementing a new calling convention to think long and hard
9008 // about this assert.
9009 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
9010 report_fatal_error("Unsupported variadic calling convention");
9011
9012 LLVMContext &C = *DAG.getContext();
9013 // Check that the call results are passed in the same way.
9014 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
9015 CCAssignFnForCall(CalleeCC, IsVarArg),
9016 CCAssignFnForCall(CallerCC, IsVarArg)))
9017 return false;
9018 // The callee has to preserve all registers the caller needs to preserve.
9019 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9020 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
9021 if (!CCMatch) {
9022 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
9023 if (Subtarget->hasCustomCallingConv()) {
9024 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
9025 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
9026 }
9027 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
9028 return false;
9029 }
9030
9031 // Nothing more to check if the callee is taking no arguments
9032 if (Outs.empty())
9033 return true;
9034
9035 SmallVector<CCValAssign, 16> ArgLocs;
9036 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
9037
9038 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9039
9040 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
9041 // When the call is musttail, additional checks have already been done, so we can safely skip this check.
9042 // At least two cases here: if caller is fastcc then we can't have any
9043 // memory arguments (we'd be expected to clean up the stack afterwards). If
9044 // caller is C then we could potentially use its argument area.
9045
9046 // FIXME: for now we take the most conservative of these in both cases:
9047 // disallow all variadic memory operands.
9048 for (const CCValAssign &ArgLoc : ArgLocs)
9049 if (!ArgLoc.isRegLoc())
9050 return false;
9051 }
9052
9053 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9054
9055 // If any of the arguments is passed indirectly, it must be SVE, so the
9056 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
9057 // allocate space on the stack. That is why we explicitly determine here
9058 // that such a call cannot be a tail call.
9059 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
9060 assert((A.getLocInfo() != CCValAssign::Indirect ||
9061 A.getValVT().isScalableVector() ||
9062 Subtarget->isWindowsArm64EC()) &&
9063 "Expected value to be scalable");
9064 return A.getLocInfo() == CCValAssign::Indirect;
9065 }))
9066 return false;
9067
9068 // If the stack arguments for this call do not fit into our own save area then
9069 // the call cannot be made tail.
9070 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
9071 return false;
9072
9073 const MachineRegisterInfo &MRI = MF.getRegInfo();
9074 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
9075 return false;
9076
9077 return true;
9078}
9079
9080SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9081 SelectionDAG &DAG,
9082 MachineFrameInfo &MFI,
9083 int ClobberedFI) const {
9084 SmallVector<SDValue, 8> ArgChains;
9085 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9086 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9087
9088 // Include the original chain at the beginning of the list. When this is
9089 // used by target LowerCall hooks, this helps legalize find the
9090 // CALLSEQ_BEGIN node.
9091 ArgChains.push_back(Chain);
9092
9093 // Add a chain value for each stack-argument load that overlaps the clobbered object.
9094 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9095 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9096 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9097 if (FI->getIndex() < 0) {
9098 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9099 int64_t InLastByte = InFirstByte;
9100 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9101
9102 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9103 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9104 ArgChains.push_back(SDValue(L, 1));
9105 }
9106
9107 // Build a tokenfactor for all the chains.
9108 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9109}
9110
9111bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9112 bool TailCallOpt) const {
9113 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9114 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9115}
9116
9117// Check if the value is zero-extended from i1 to i8
9118static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9119 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9120 if (SizeInBits < 8)
9121 return false;
9122
9123 APInt RequiredZero(SizeInBits, 0xFE);
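 // 0xFE covers bits [7:1]; if those are known zero, the value is already a
 // valid zero-extension of an i1.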
9124 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9125 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9126 return ZExtBool;
9127}
9128
9129void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9130 SDNode *Node) const {
9131 // Live-in physreg copies that are glued to SMSTART are applied as
9132 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9133 // register allocator to pass call args in callee saved regs, without extra
9134 // copies to avoid these fake clobbers of actually-preserved GPRs.
9135 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9136 MI.getOpcode() == AArch64::MSRpstatePseudo) {
9137 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9138 if (MachineOperand &MO = MI.getOperand(I);
9139 MO.isReg() && MO.isImplicit() && MO.isDef() &&
9140 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9141 AArch64::GPR64RegClass.contains(MO.getReg())))
9142 MI.removeOperand(I);
9143
9144 // The SVE vector length can change when entering/leaving streaming mode.
9145 // FPMR is set to 0 when entering/leaving streaming mode.
9146 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9147 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
9148 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9149 /*IsImplicit=*/true));
9150 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9151 /*IsImplicit=*/true));
9152 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9153 /*IsImplicit=*/true));
9154 }
9155 }
9156
9157 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9158 // have nothing to do with VG, were it not that they are used to materialise a
9159 // frame-address. If they contain a frame-index to a scalable vector, this
9160 // will likely require an ADDVL instruction to materialise the address, thus
9161 // reading VG.
9162 const MachineFunction &MF = *MI.getMF();
9163 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9164 (MI.getOpcode() == AArch64::ADDXri ||
9165 MI.getOpcode() == AArch64::SUBXri)) {
9166 const MachineOperand &MO = MI.getOperand(1);
9167 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
9169 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9170 /*IsImplicit=*/true));
9171 }
9172}
9173
9174SDValue AArch64TargetLowering::changeStreamingMode(
9175 SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9176 unsigned Condition, bool InsertVectorLengthCheck) const {
9177 MachineFunction &MF = DAG.getMachineFunction();
9178 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9179 FuncInfo->setHasStreamingModeChanges(true);
9180
9181 auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9182 SmallVector<SDValue, 2> Ops = {Chain};
9183 if (InGlue)
9184 Ops.push_back(InGlue);
9185 return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9186 DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9187 };
9188
9189 if (InsertVectorLengthCheck && Enable) {
9190 // Non-streaming -> Streaming
9191 // Insert vector length check before smstart
9192 SDValue CheckVL = GetCheckVL(Chain, InGlue);
9193 Chain = CheckVL.getValue(0);
9194 InGlue = CheckVL.getValue(1);
9195 }
9196
9197 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9198 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9199 SDValue MSROp =
9200 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9201 SmallVector<SDValue> Ops = {Chain, MSROp};
9202 unsigned Opcode;
9203 if (Condition != AArch64SME::Always) {
9204 Register PStateReg = FuncInfo->getPStateSMReg();
9205 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9206 SDValue PStateSM =
9207 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9208 // Use chain and glue from the CopyFromReg.
9209 Ops[0] = PStateSM.getValue(1);
9210 InGlue = PStateSM.getValue(2);
9211 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9212 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9213 Ops.push_back(ConditionOp);
9214 Ops.push_back(PStateSM);
9215 } else {
9216 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9217 }
9218 Ops.push_back(RegMask);
9219
9220 if (InGlue)
9221 Ops.push_back(InGlue);
9222
9223 SDValue SMChange =
9224 DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9225
9226 if (!InsertVectorLengthCheck || Enable)
9227 return SMChange;
9228
9229 // Streaming -> Non-streaming
9230 // Insert vector length check after smstop since we cannot read VL
9231 // in streaming mode
9232 return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9233}
9234
9235// Emit a call to __arm_sme_save or __arm_sme_restore.
9236static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
9237 SelectionDAG &DAG,
9238 AArch64FunctionInfo *Info, SDLoc DL,
9239 SDValue Chain, bool IsSave) {
9240 MachineFunction &MF = DAG.getMachineFunction();
9241 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9242 FuncInfo->setSMESaveBufferUsed();
9243 TargetLowering::ArgListTy Args;
9244 Args.emplace_back(
9245 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
9246 PointerType::getUnqual(*DAG.getContext()));
9247
9248 RTLIB::Libcall LC =
9249 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
9250 SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
9251 TLI.getPointerTy(DAG.getDataLayout()));
9252 auto *RetTy = Type::getVoidTy(*DAG.getContext());
9253 TargetLowering::CallLoweringInfo CLI(DAG);
9254 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
9255 TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
9256 return TLI.LowerCallTo(CLI).second;
9257}
9258
9259static AArch64SME::ToggleCondition
9260getSMToggleCondition(const SMECallAttrs &CallAttrs) {
9261 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9262 CallAttrs.caller().hasStreamingBody())
9263 return AArch64SME::Always;
9264 if (CallAttrs.callee().hasNonStreamingInterface())
9265 return AArch64SME::IfCallerIsStreaming;
9266 if (CallAttrs.callee().hasStreamingInterface())
9267 return AArch64SME::IfCallerIsNonStreaming;
9268
9269 llvm_unreachable("Unsupported attributes");
9270}
9271
9272/// Check whether a stack argument requires lowering in a tail call.
9273static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
9274 const CCValAssign &VA, SDValue Arg,
9275 ISD::ArgFlagsTy Flags, int CallOffset) {
9276 // FIXME: We should be able to handle this case, but it's not clear how to.
9277 if (Flags.isZExt() || Flags.isSExt())
9278 return true;
9279
9280 for (;;) {
9281 // Look through nodes that don't alter the bits of the incoming value.
9282 unsigned Op = Arg.getOpcode();
9283 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9284 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9285 Arg = Arg.getOperand(0);
9286 continue;
9287 }
9288 break;
9289 }
9290
9291 // If the argument is a load from the same immutable stack slot, we can reuse
9292 // it.
9293 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9294 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9295 const MachineFrameInfo &MFI = MF.getFrameInfo();
9296 int FI = FINode->getIndex();
9297 if (!MFI.isImmutableObjectIndex(FI))
9298 return true;
9299 if (CallOffset != MFI.getObjectOffset(FI))
9300 return true;
9301 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9302 if (SizeInBits / 8 != static_cast<uint64_t>(MFI.getObjectSize(FI)))
9303 return true;
9304 return false;
9305 }
9306 }
9307
9308 return true;
9309}
9310
9311/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9312/// and add input and output parameter nodes.
9313SDValue
9314AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9315 SmallVectorImpl<SDValue> &InVals) const {
9316 SelectionDAG &DAG = CLI.DAG;
9317 SDLoc &DL = CLI.DL;
9318 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9319 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9320 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9321 SDValue Chain = CLI.Chain;
9322 SDValue Callee = CLI.Callee;
9323 bool &IsTailCall = CLI.IsTailCall;
9324 CallingConv::ID &CallConv = CLI.CallConv;
9325 bool IsVarArg = CLI.IsVarArg;
9326 const CallBase *CB = CLI.CB;
9327
9328 MachineFunction &MF = DAG.getMachineFunction();
9329 MachineFunction::CallSiteInfo CSInfo;
9330 bool IsThisReturn = false;
9331
9332 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9333 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9334 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9335 bool IsSibCall = false;
9336 bool GuardWithBTI = false;
9337
9338 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9339 !Subtarget->noBTIAtReturnTwice()) {
9340 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9341 }
9342
9343 // Analyze operands of the call, assigning locations to each operand.
9344 SmallVector<CCValAssign, 16> ArgLocs;
9345 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9346
9347 if (IsVarArg) {
9348 unsigned NumArgs = Outs.size();
9349
9350 for (unsigned i = 0; i != NumArgs; ++i) {
9351 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9352 report_fatal_error("Passing SVE types to variadic functions is "
9353 "currently not supported");
9354 }
9355 }
9356
9357 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9358
9359 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9360 // Assign locations to each value returned by this call.
9361 SmallVector<CCValAssign, 16> RVLocs;
9362 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9363 *DAG.getContext());
9364 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9365
9366 // Set type id for call site info.
9367 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9368 CSInfo = MachineFunction::CallSiteInfo(*CB);
9369
9370 // Check callee args/returns for SVE registers and set calling convention
9371 // accordingly.
9372 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9373 auto HasSVERegLoc = [](CCValAssign &Loc) {
9374 if (!Loc.isRegLoc())
9375 return false;
9376 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9377 AArch64::PPRRegClass.contains(Loc.getLocReg());
9378 };
9379 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9380 CallConv = CallingConv::AArch64_SVE_VectorCall;
9381 }
9382
9383 // Determine whether we need any streaming mode changes.
9384 SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
9385
9386 std::optional<unsigned> ZAMarkerNode;
9387 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9388
9389 if (UseNewSMEABILowering) {
9390 if (CallAttrs.requiresLazySave() ||
9391 CallAttrs.requiresPreservingAllZAState())
9392 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9393 else if (CallAttrs.caller().hasZAState() ||
9394 CallAttrs.caller().hasZT0State())
9395 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
9396 }
9397
9398 if (IsTailCall) {
9399 // Check if it's really possible to do a tail call.
9400 IsTailCall = isEligibleForTailCallOptimization(CLI);
9401
9402 // A sibling call is one where we're under the usual C ABI and not planning
9403 // to change that but can still do a tail call:
9404 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9405 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9406 IsSibCall = true;
9407
9408 if (IsTailCall)
9409 ++NumTailCalls;
9410 }
9411
9412 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9413 report_fatal_error("failed to perform tail call elimination on a call "
9414 "site marked musttail");
9415
9416 // Get a count of how many bytes are to be pushed on the stack.
9417 unsigned NumBytes = CCInfo.getStackSize();
9418
9419 if (IsSibCall) {
9420 // Since we're not changing the ABI to make this a tail call, the memory
9421 // operands are already available in the caller's incoming argument space.
9422 NumBytes = 0;
9423 }
9424
9425 // FPDiff is the byte offset of the call's argument area from the callee's.
9426 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9427 // by this amount for a tail call. In a sibling call it must be 0 because the
9428 // caller will deallocate the entire stack and the callee still expects its
9429 // arguments to begin at SP+0. Completely unused for non-tail calls.
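 // E.g. with 32 bytes of reusable incoming-argument space and a tail call
 // needing 48 bytes of arguments, FPDiff = 32 - 48 = -16.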
9430 int FPDiff = 0;
9431
9432 if (IsTailCall && !IsSibCall) {
9433 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9434
9435 // Since callee will pop argument stack as a tail call, we must keep the
9436 // popped size 16-byte aligned.
9437 NumBytes = alignTo(NumBytes, 16);
9438
9439 // FPDiff will be negative if this tail call requires more space than we
9440 // would automatically have in our incoming argument space. Positive if we
9441 // can actually shrink the stack.
9442 FPDiff = NumReusableBytes - NumBytes;
9443
9444 // Update the required reserved area if this is the tail call requiring the
9445 // most argument stack space.
9446 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9447 FuncInfo->setTailCallReservedStack(-FPDiff);
9448
9449 // The stack pointer must be 16-byte aligned at all times it's used for a
9450 // memory operation, which in practice means at *all* times and in
9451 // particular across call boundaries. Therefore our own arguments started at
9452 // a 16-byte aligned SP and the delta applied for the tail call should
9453 // satisfy the same constraint.
9454 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9455 }
9456
9457 auto DescribeCallsite =
9458 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9459 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9460 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9461 R << ore::NV("Callee", ES->getSymbol());
9462 else if (CLI.CB && CLI.CB->getCalledFunction())
9463 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9464 else
9465 R << "unknown callee";
9466 R << "'";
9467 return R;
9468 };
9469
9470 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9471 bool RequiresSaveAllZA =
9472 !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
9473 if (RequiresLazySave) {
9474 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9475 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9476 TPIDR2.FrameIndex,
9477 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9478 Chain = DAG.getNode(
9479 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9480 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9481 TPIDR2ObjAddr);
9482 OptimizationRemarkEmitter ORE(&MF.getFunction());
9483 ORE.emit([&]() {
9484 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9485 CLI.CB)
9486 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9487 &MF.getFunction());
9488 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9489 });
9490 } else if (RequiresSaveAllZA) {
9491 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9492 "Cannot share state that may not exist");
9493 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9494 /*IsSave=*/true);
9495 }
9496
9497 bool RequiresSMChange = CallAttrs.requiresSMChange();
9498 if (RequiresSMChange) {
9499 OptimizationRemarkEmitter ORE(&MF.getFunction());
9500 ORE.emit([&]() {
9501 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9502 CLI.CB)
9503 : OptimizationRemarkAnalysis("sme", "SMETransition",
9504 &MF.getFunction());
9505 DescribeCallsite(R) << " requires a streaming mode transition";
9506 return R;
9507 });
9508 }
9509
9510 SDValue ZTFrameIdx;
9511 MachineFrameInfo &MFI = MF.getFrameInfo();
9512 bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0();
9513
9514 // If the caller has ZT0 state which will not be preserved by the callee,
9515 // spill ZT0 before the call.
9516 if (ShouldPreserveZT0) {
9517 ZTFrameIdx = getZT0FrameIndex(MFI, *FuncInfo, DAG);
9518
9519 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9520 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9521 }
9522
9523 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
9524 // PSTATE.ZA before the call if there is no lazy-save active.
9525 bool DisableZA = CallAttrs.requiresDisablingZABeforeCall();
9526 assert((!DisableZA || !RequiresLazySave) &&
9527 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9528
9529 if (DisableZA)
9530 Chain = DAG.getNode(
9531 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9532 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9533
9534 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9535 // These operations are automatically eliminated by the prolog/epilog pass
9536 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9537 if (!IsSibCall) {
9538 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9539 if (ZAMarkerNode) {
9540 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to; simply
9541 // using a chain can result in incorrect scheduling. The markers refer to
9542 // the position just before the CALLSEQ_START (though occur after as
9543 // CALLSEQ_START lacks in-glue).
9544 Chain = DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other),
9545 {Chain, Chain.getValue(1)});
9546 }
9547 }
9548
9549 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9550 getPointerTy(DAG.getDataLayout()));
9551
9552 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9553 SmallSet<unsigned, 8> RegsUsed;
9554 SmallVector<SDValue, 8> MemOpChains;
9555 auto PtrVT = getPointerTy(DAG.getDataLayout());
9556
9557 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9558 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9559 for (const auto &F : Forwards) {
9560 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9561 RegsToPass.emplace_back(F.PReg, Val);
9562 }
9563 }
9564
9565 // Walk the register/memloc assignments, inserting copies/loads.
9566 unsigned ExtraArgLocs = 0;
9567 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9568 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9569 SDValue Arg = OutVals[i];
9570 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9571
9572 // Promote the value if needed.
9573 switch (VA.getLocInfo()) {
9574 default:
9575 llvm_unreachable("Unknown loc info!");
9576 case CCValAssign::Full:
9577 break;
9578 case CCValAssign::SExt:
9579 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9580 break;
9581 case CCValAssign::ZExt:
9582 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9583 break;
9584 case CCValAssign::AExt:
9585 if (Outs[i].ArgVT == MVT::i1) {
9586 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9587 //
9588 // Check if we actually have to do this, because the value may
9589 // already be zero-extended.
9590 //
9591 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9592 // and rely on DAGCombiner to fold this, because the following
9593 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9594 //
9595 // (ext (zext x)) -> (zext x)
9596 //
9597 // This will give us (zext i32), which we cannot remove, so
9598 // try to check this beforehand.
9599 if (!checkZExtBool(Arg, DAG)) {
9600 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9601 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9602 }
9603 }
9604 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9605 break;
9606 case CCValAssign::AExtUpper:
9607 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9608 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9609 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9610 DAG.getConstant(32, DL, VA.getLocVT()));
9611 break;
9612 case CCValAssign::BCvt:
9613 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9614 break;
9615 case CCValAssign::Trunc:
9616 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9617 break;
9618 case CCValAssign::FPExt:
9619 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9620 break;
9621 case CCValAssign::Indirect: {
9622 bool isScalable = VA.getValVT().isScalableVT();
9623 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9624 "Indirect arguments should be scalable on most subtargets");
9625
9626 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9627 uint64_t PartSize = StoreSize;
9628 unsigned NumParts = 1;
9629 if (Outs[i].Flags.isInConsecutiveRegs()) {
9630 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9631 ++NumParts;
9632 StoreSize *= NumParts;
9633 }
9634
9635 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9636 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9637 MachineFrameInfo &MFI = MF.getFrameInfo();
9638 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9639 if (isScalable)
9640 MFI.setStackID(FI, TargetStackID::ScalableVector);
9641
9642 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9643 SDValue Ptr = DAG.getFrameIndex(
9644 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9645 SDValue SpillSlot = Ptr;
9646
9647 // Ensure we generate all stores for each tuple part, whilst updating the
9648 // pointer after each store correctly using vscale.
9649 while (NumParts) {
9650 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9651 MemOpChains.push_back(Store);
9652
9653 NumParts--;
9654 if (NumParts > 0) {
9655 SDValue BytesIncrement;
9656 if (isScalable) {
9657 BytesIncrement = DAG.getVScale(
9658 DL, Ptr.getValueType(),
9659 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9660 } else {
9661 BytesIncrement = DAG.getConstant(
9662 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9663 Ptr.getValueType());
9664 }
9665 MPI = MachinePointerInfo(MPI.getAddrSpace());
9666 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9667 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9668 ExtraArgLocs++;
9669 i++;
9670 }
9671 }
9672
9673 Arg = SpillSlot;
9674 break;
9675 }
9676
9677 if (VA.isRegLoc()) {
9678 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9679 Outs[0].VT == MVT::i64) {
9680 assert(VA.getLocVT() == MVT::i64 &&
9681 "unexpected calling convention register assignment");
9682 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9683 "unexpected use of 'returned'");
9684 IsThisReturn = true;
9685 }
9686 if (RegsUsed.count(VA.getLocReg())) {
9687 // If this register has already been used then we're trying to pack
9688 // parts of an [N x i32] into an X-register. The extension type will
9689 // take care of putting the two halves in the right place but we have to
9690 // combine them.
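 // E.g. for a split [2 x i32], the AExtUpper case above already shifted the
 // second half into bits [63:32], so the OR below merges both halves into a
 // single X register.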
9691 SDValue &Bits =
9692 llvm::find_if(RegsToPass,
9693 [=](const std::pair<unsigned, SDValue> &Elt) {
9694 return Elt.first == VA.getLocReg();
9695 })
9696 ->second;
9697 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9698 // Call site info is used for function's parameter entry value
9699 // tracking. For now we track only simple cases when parameter
9700 // is transferred through whole register.
9701 llvm::erase_if(CSInfo.ArgRegPairs,
9702 [&VA](MachineFunction::ArgRegPair ArgReg) {
9703 return ArgReg.Reg == VA.getLocReg();
9704 });
9705 } else {
9706 // Add an extra level of indirection for streaming mode changes by
9707 // using a pseudo copy node that cannot be rematerialised between a
9708 // smstart/smstop and the call by the simple register coalescer.
9709 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9710 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9711 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
9712 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9713 RegsUsed.insert(VA.getLocReg());
9714 const TargetOptions &Options = DAG.getTarget().Options;
9715 if (Options.EmitCallSiteInfo)
9716 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9717 }
9718 } else {
9719 assert(VA.isMemLoc());
9720
9721 SDValue DstAddr;
9722 MachinePointerInfo DstInfo;
9723
9724 // FIXME: This works on big-endian for composite byvals, which are the
9725 // common case. It should also work for fundamental types.
9726 uint32_t BEAlign = 0;
9727 unsigned OpSize;
9728 if (VA.getLocInfo() == CCValAssign::Indirect ||
9729 VA.getLocInfo() == CCValAssign::Trunc)
9730 OpSize = VA.getLocVT().getFixedSizeInBits();
9731 else
9732 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9733 : VA.getValVT().getSizeInBits();
9734 OpSize = (OpSize + 7) / 8;
9735 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9736 !Flags.isInConsecutiveRegs()) {
9737 if (OpSize < 8)
9738 BEAlign = 8 - OpSize;
9739 }
9740 unsigned LocMemOffset = VA.getLocMemOffset();
9741 int32_t Offset = LocMemOffset + BEAlign;
9742
9743 if (IsTailCall) {
9744 // When the frame pointer is perfectly aligned for the tail call and the
9745 // same stack argument is passed down intact, we can reuse it.
9746 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
9747 continue;
9748
9749 Offset = Offset + FPDiff;
9750 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9751
9752 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9753 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9754
9755 // Make sure any stack arguments overlapping with where we're storing
9756 // are loaded before this eventual operation. Otherwise they'll be
9757 // clobbered.
9758 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9759 } else {
9760 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9761
9762 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9763 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9764 }
9765
9766 if (Outs[i].Flags.isByVal()) {
9767 SDValue SizeNode =
9768 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9769 SDValue Cpy = DAG.getMemcpy(
9770 Chain, DL, DstAddr, Arg, SizeNode,
9771 Outs[i].Flags.getNonZeroByValAlign(),
9772 /*isVol = */ false, /*AlwaysInline = */ false,
9773 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9774
9775 MemOpChains.push_back(Cpy);
9776 } else {
9777 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
9778 // promoted to a legal register type i32, we should truncate Arg back to
9779 // i1/i8/i16.
9780 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9781 VA.getValVT() == MVT::i16)
9782 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9783
9784 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9785 MemOpChains.push_back(Store);
9786 }
9787 }
9788 }
9789
9790 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
9791 !(CLI.CB && CLI.CB->isMustTailCall())) {
9792 SDValue ParamPtr = StackPtr;
9793 if (IsTailCall) {
9794 // Create a dummy object at the top of the stack that can be used to get
9795 // the SP after the epilogue
9796 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9797 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9798 }
9799
9800 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9801 // describing the argument list. x4 contains the address of the
9802 // first stack parameter. x5 contains the size in bytes of all parameters
9803 // passed on the stack.
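    // As an illustrative example: for a variadic call whose arguments all fit
    // in registers, x4 still points at the (empty) stack-argument area and x5
    // is 0.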
9804 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9805 RegsToPass.emplace_back(AArch64::X5,
9806 DAG.getConstant(NumBytes, DL, MVT::i64));
9807 }
9808
9809 if (!MemOpChains.empty())
9810 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9811
9812 SDValue InGlue;
9813 if (RequiresSMChange) {
9814 bool InsertVectorLengthCheck =
9816 Chain = changeStreamingMode(
9817 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
9818 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
9819 InGlue = Chain.getValue(1);
9820 }
9821
9822 // Build a sequence of copy-to-reg nodes chained together with token chain
9823 // and flag operands which copy the outgoing args into the appropriate regs.
9824 for (auto &RegToPass : RegsToPass) {
9825 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9826 RegToPass.second, InGlue);
9827 InGlue = Chain.getValue(1);
9828 }
9829
9830 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9831 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9832 // node so that legalize doesn't hack it.
9833 const GlobalValue *CalledGlobal = nullptr;
9834 unsigned OpFlags = 0;
9835 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9836 CalledGlobal = G->getGlobal();
9837 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9839 if (OpFlags & AArch64II::MO_GOT) {
9840 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9841 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9842 } else {
9843 const GlobalValue *GV = G->getGlobal();
9844 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9845 }
9846 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9847 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9848 Subtarget->isTargetMachO()) ||
9850 const char *Sym = S->getSymbol();
9851 if (UseGot) {
9853 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9854 } else {
9855 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9856 }
9857 }
9858
9859 // We don't usually want to end the call-sequence here because we would tidy
9860   // the frame up *after* the call. However, in the ABI-changing tail-call case
9861 // we've carefully laid out the parameters so that when sp is reset they'll be
9862 // in the correct location.
9863 if (IsTailCall && !IsSibCall) {
9864 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9865 InGlue = Chain.getValue(1);
9866 }
9867
9868 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9869
9870 std::vector<SDValue> Ops;
9871 Ops.push_back(Chain);
9872 Ops.push_back(Callee);
9873
9874 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9875 // be expanded to the call, directly followed by a special marker sequence and
9876 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
9877 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9878 assert(!IsTailCall &&
9879 "tail calls cannot be marked with clang.arc.attachedcall");
9880 Opc = AArch64ISD::CALL_RVMARKER;
9881
9882 // Add a target global address for the retainRV/claimRV runtime function
9883 // just before the call target.
9884 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9885 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9886 Ops.insert(Ops.begin() + 1, GA);
9887
9888 // We may or may not need to emit both the marker and the retain/claim call.
9889 // Tell the pseudo expansion using an additional boolean op.
9890 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
9891 SDValue DoEmitMarker =
9892 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
9893 Ops.insert(Ops.begin() + 2, DoEmitMarker);
9894 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9895 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9896 } else if (GuardWithBTI) {
9897 Opc = AArch64ISD::CALL_BTI;
9898 }
9899
9900 if (IsTailCall) {
9901 // Each tail call may have to adjust the stack by a different amount, so
9902 // this information must travel along with the operation for eventual
9903 // consumption by emitEpilogue.
9904 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9905 }
9906
9907 if (CLI.PAI) {
9908 const uint64_t Key = CLI.PAI->Key;
9910 "Invalid auth call key");
9911
9912 // Split the discriminator into address/integer components.
9913 SDValue AddrDisc, IntDisc;
9914 std::tie(IntDisc, AddrDisc) =
9915 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9916
9917 if (Opc == AArch64ISD::CALL_RVMARKER)
9918 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9919 else
9920 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
9921 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9922 Ops.push_back(IntDisc);
9923 Ops.push_back(AddrDisc);
9924 }
9925
9926 // Add argument registers to the end of the list so that they are known live
9927 // into the call.
9928 for (auto &RegToPass : RegsToPass)
9929 Ops.push_back(DAG.getRegister(RegToPass.first,
9930 RegToPass.second.getValueType()));
9931
9932 // Add a register mask operand representing the call-preserved registers.
9933 const uint32_t *Mask;
9934 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9935 if (IsThisReturn) {
9936 // For 'this' returns, use the X0-preserving mask if applicable
9937 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
9938 if (!Mask) {
9939 IsThisReturn = false;
9940 Mask = TRI->getCallPreservedMask(MF, CallConv);
9941 }
9942 } else
9943 Mask = TRI->getCallPreservedMask(MF, CallConv);
9944
9945 if (Subtarget->hasCustomCallingConv())
9946 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
9947
9948 if (TRI->isAnyArgRegReserved(MF))
9949 TRI->emitReservedArgRegCallError(MF);
9950
9951 assert(Mask && "Missing call preserved mask for calling convention");
9952 Ops.push_back(DAG.getRegisterMask(Mask));
9953
9954 if (InGlue.getNode())
9955 Ops.push_back(InGlue);
9956
9957   // If we're doing a tail call, use a TC_RETURN here rather than an
9958 // actual call instruction.
9959 if (IsTailCall) {
9961 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
9962 if (IsCFICall)
9963 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9964
9965 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
9966 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
9967 if (CalledGlobal &&
9968 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
9969 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
9970 return Ret;
9971 }
9972
9973 // Returns a chain and a flag for retval copy to use.
9974 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
9975 if (IsCFICall)
9976 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9977
9978 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
9979 InGlue = Chain.getValue(1);
9980 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
9981 if (CalledGlobal &&
9982 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
9983 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
9984
9985 uint64_t CalleePopBytes =
9986 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
9987
9988 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
9989 InGlue = Chain.getValue(1);
9990
9991 // Handle result values, copying them out of physregs into vregs that we
9992 // return.
9993 SDValue Result = LowerCallResult(
9994 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
9995 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
9996
9997 if (!Ins.empty())
9998 InGlue = Result.getValue(Result->getNumValues() - 1);
9999
10000 if (RequiresSMChange) {
10002 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10003 getSMToggleCondition(CallAttrs));
10004 }
10005
10006 if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())
10007 // Unconditionally resume ZA.
10008 Result = DAG.getNode(
10009 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
10010 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
10011
10012 if (ShouldPreserveZT0)
10013 Result =
10014 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
10015 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
10016
10017 if (RequiresLazySave) {
10018 // Conditionally restore the lazy save using a pseudo node.
10019 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
10020 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
10021 SDValue RegMask = DAG.getRegisterMask(
10022 TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC)));
10023 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
10025 SDValue TPIDR2_EL0 = DAG.getNode(
10026 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
10027 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
10028 // Copy the address of the TPIDR2 block into X0 before 'calling' the
10029 // RESTORE_ZA pseudo.
10030 SDValue Glue;
10031 SDValue TPIDR2Block = DAG.getFrameIndex(
10032 TPIDR2.FrameIndex,
10034 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
10035 Result =
10036 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
10037 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
10038 RestoreRoutine, RegMask, Result.getValue(1)});
10039 // Finally reset the TPIDR2_EL0 register to 0.
10040 Result = DAG.getNode(
10041 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
10042 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
10043 DAG.getConstant(0, DL, MVT::i64));
10044 TPIDR2.Uses++;
10045 } else if (RequiresSaveAllZA) {
10046 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
10047 /*IsSave=*/false);
10048 }
10049
10050 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
10051 RequiresSaveAllZA) {
10052 for (unsigned I = 0; I < InVals.size(); ++I) {
10053 // The smstart/smstop is chained as part of the call, but when the
10054 // resulting chain is discarded (which happens when the call is not part
10055 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10056 // smstart/smstop is chained to the result value. We can do that by doing
10057 // a vreg -> vreg copy.
10060 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10061 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10062 InVals[I].getValueType());
10063 }
10064 }
10065
10066 if (CallConv == CallingConv::PreserveNone) {
10067 for (const ISD::OutputArg &O : Outs) {
10068 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10069 O.Flags.isSwiftAsync()) {
10070 MachineFunction &MF = DAG.getMachineFunction();
10071 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10072 MF.getFunction(),
10073 "Swift attributes can't be used with preserve_none",
10074 DL.getDebugLoc()));
10075 break;
10076 }
10077 }
10078 }
10079
10080 return Result;
10081}
10082
10083bool AArch64TargetLowering::CanLowerReturn(
10084 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
10086 const Type *RetTy) const {
10087 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10089 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
10090 return CCInfo.CheckReturn(Outs, RetCC);
10091}
10092
10093SDValue
10094AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
10095 bool isVarArg,
10097 const SmallVectorImpl<SDValue> &OutVals,
10098 const SDLoc &DL, SelectionDAG &DAG) const {
10099 auto &MF = DAG.getMachineFunction();
10100 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10101
10102 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10104 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
10105 CCInfo.AnalyzeReturn(Outs, RetCC);
10106
10107 // Copy the result values into the output registers.
10108 SDValue Glue;
10110 SmallSet<unsigned, 4> RegsUsed;
10111 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
10112 ++i, ++realRVLocIdx) {
10113 CCValAssign &VA = RVLocs[i];
10114 assert(VA.isRegLoc() && "Can only return in registers!");
10115 SDValue Arg = OutVals[realRVLocIdx];
10116
10117 switch (VA.getLocInfo()) {
10118 default:
10119 llvm_unreachable("Unknown loc info!");
10120 case CCValAssign::Full:
10121 if (Outs[i].ArgVT == MVT::i1) {
10122 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
10123 // value. This is strictly redundant on Darwin (which uses "zeroext
10124 // i1"), but will be optimised out before ISel.
10125 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10126 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10127 }
10128 break;
10129 case CCValAssign::BCvt:
10130 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
10131 break;
10132 case CCValAssign::AExt:
10133 case CCValAssign::ZExt:
10134 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10135 break;
10137 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10138 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10139 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10140 DAG.getConstant(32, DL, VA.getLocVT()));
10141 break;
10142 }
10143
10144 if (RegsUsed.count(VA.getLocReg())) {
10145 SDValue &Bits =
10146 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
10147 return Elt.first == VA.getLocReg();
10148 })->second;
10149 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10150 } else {
10151 RetVals.emplace_back(VA.getLocReg(), Arg);
10152 RegsUsed.insert(VA.getLocReg());
10153 }
10154 }
10155
10156 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10157
10158 // Emit SMSTOP before returning from a locally streaming function
10159 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
10160 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
10161 if (FuncAttrs.hasStreamingCompatibleInterface())
10162 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10163 /*Glue*/ SDValue(),
10165 else
10166 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10167 /*Glue*/ SDValue(), AArch64SME::Always);
10168 Glue = Chain.getValue(1);
10169 }
10170
10171 SmallVector<SDValue, 4> RetOps(1, Chain);
10172 for (auto &RetVal : RetVals) {
10173 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
10174 isPassedInFPR(RetVal.second.getValueType()))
10175 RetVal.second =
10176 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10177 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
10178 RetVal.second);
10179 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
10180 Glue = Chain.getValue(1);
10181 RetOps.push_back(
10182 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
10183 }
10184
10185 // Windows AArch64 ABIs require that for returning structs by value we copy
10186 // the sret argument into X0 for the return.
10187 // We saved the argument into a virtual register in the entry block,
10188 // so now we copy the value out and into X0.
10189 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
10190 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
10192
10193 unsigned RetValReg = AArch64::X0;
10194 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
10195 RetValReg = AArch64::X8;
10196 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
10197 Glue = Chain.getValue(1);
10198
10199 RetOps.push_back(
10200 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
10201 }
10202
10203 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
10204 if (I) {
10205 for (; *I; ++I) {
10206 if (AArch64::GPR64RegClass.contains(*I))
10207 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
10208 else if (AArch64::FPR64RegClass.contains(*I))
10209 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
10210 else
10211 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10212 }
10213 }
10214
10215 RetOps[0] = Chain; // Update chain.
10216
10217 // Add the glue if we have it.
10218 if (Glue.getNode())
10219 RetOps.push_back(Glue);
10220
10221 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10222 // ARM64EC entry thunks use a special return sequence: instead of a regular
10223 // "ret" instruction, they need to explicitly call the emulator.
10224 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10225 SDValue Arm64ECRetDest =
10226 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
10227 Arm64ECRetDest =
10228 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
10229 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
10230 MachinePointerInfo());
10231 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
10232 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
10233 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
10234 }
10235
10236 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
10237}
10238
10239//===----------------------------------------------------------------------===//
10240// Other Lowering Code
10241//===----------------------------------------------------------------------===//
10242
10243SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10244 SelectionDAG &DAG,
10245 unsigned Flag) const {
10246 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10247 N->getOffset(), Flag);
10248}
10249
10250SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10251 SelectionDAG &DAG,
10252 unsigned Flag) const {
10253 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10254}
10255
10256SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10257 SelectionDAG &DAG,
10258 unsigned Flag) const {
10259 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10260 N->getOffset(), Flag);
10261}
10262
10263SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
10264 SelectionDAG &DAG,
10265 unsigned Flag) const {
10266 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10267}
10268
10269SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10270 SelectionDAG &DAG,
10271 unsigned Flag) const {
10272 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10273}
10274
10275// (loadGOT sym)
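// As a rough sketch, this typically expands to a GOT load such as (the exact
// relocation spelling depends on the target):
//   adrp xN, :got:sym
//   ldr  xN, [xN, :got_lo12:sym]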
10276template <class NodeTy>
10277SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10278 unsigned Flags) const {
10279 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10280 SDLoc DL(N);
10281 EVT Ty = getPointerTy(DAG.getDataLayout());
10282 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10283 // FIXME: Once remat is capable of dealing with instructions with register
10284 // operands, expand this into two nodes instead of using a wrapper node.
10285 if (DAG.getMachineFunction()
10286 .getInfo<AArch64FunctionInfo>()
10287 ->hasELFSignedGOT())
10288 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10289 0);
10290 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10291}
10292
10293// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
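// A sketch of the large-code-model materialization this expands to (modifier
// names shown in the usual ELF spelling):
//   movz xN, #:abs_g3:sym
//   movk xN, #:abs_g2_nc:sym
//   movk xN, #:abs_g1_nc:sym
//   movk xN, #:abs_g0_nc:sym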
10294template <class NodeTy>
10295SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10296 unsigned Flags) const {
10297 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10298 SDLoc DL(N);
10299 EVT Ty = getPointerTy(DAG.getDataLayout());
10300 const unsigned char MO_NC = AArch64II::MO_NC;
10301 return DAG.getNode(
10302 AArch64ISD::WrapperLarge, DL, Ty,
10303 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10304 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10305 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10306 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10307}
10308
10309// (addlow (adrp %hi(sym)) %lo(sym))
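// A sketch of the resulting code (ELF spelling; MachO uses @PAGE/@PAGEOFF):
//   adrp xN, sym
//   add  xN, xN, :lo12:sym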
10310template <class NodeTy>
10311SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10312 unsigned Flags) const {
10313 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10314 SDLoc DL(N);
10315 EVT Ty = getPointerTy(DAG.getDataLayout());
10316 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10317 SDValue Lo = getTargetNode(N, Ty, DAG,
10319 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10320 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10321}
10322
10323// (adr sym)
10324template <class NodeTy>
10325SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10326 unsigned Flags) const {
10327 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10328 SDLoc DL(N);
10329 EVT Ty = getPointerTy(DAG.getDataLayout());
10330 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10331 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10332}
10333
10334SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10335 SelectionDAG &DAG) const {
10336 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10337 const GlobalValue *GV = GN->getGlobal();
10338 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10339
10340 if (OpFlags != AArch64II::MO_NO_FLAG)
10342 "unexpected offset in global node");
10343
10344 // This also catches the large code model case for Darwin, and tiny code
10345 // model with got relocations.
10346 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10347 return getGOT(GN, DAG, OpFlags);
10348 }
10349
10353 Result = getAddrLarge(GN, DAG, OpFlags);
10354 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10355 Result = getAddrTiny(GN, DAG, OpFlags);
10356 } else {
10357 Result = getAddr(GN, DAG, OpFlags);
10358 }
10359 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10360 SDLoc DL(GN);
10362 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10364 return Result;
10365}
10366
10367/// Convert a TLS address reference into the correct sequence of loads
10368/// and calls to compute the variable's address (for Darwin, currently) and
10369/// return an SDValue containing the final node.
10370
10371/// Darwin only has one TLS scheme which must be capable of dealing with the
10372/// fully general situation, in the worst case. This means:
10373/// + "extern __thread" declaration.
10374/// + Defined in a possibly unknown dynamic library.
10375///
10376/// The general system is that each __thread variable has a [3 x i64] descriptor
10377/// which contains information used by the runtime to calculate the address. The
10378/// only part of this the compiler needs to know about is the first xword, which
10379/// contains a function pointer that must be called with the address of the
10380/// entire descriptor in "x0".
10381///
10382/// Since this descriptor may be in a different unit, in general even the
10383/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10384/// is:
10385/// adrp x0, _var@TLVPPAGE
10386/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10387/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10388/// ; the function pointer
10389/// blr x1 ; Uses descriptor address in x0
10390/// ; Address of _var is now in x0.
10391///
10392/// If the address of _var's descriptor *is* known to the linker, then it can
10393/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10394/// a slight efficiency gain.
10395SDValue
10396AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10397 SelectionDAG &DAG) const {
10398 assert(Subtarget->isTargetDarwin() &&
10399 "This function expects a Darwin target");
10400
10401 SDLoc DL(Op);
10402 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10403 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10404 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10405
10406 SDValue TLVPAddr =
10407 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10408 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10409
10410 // The first entry in the descriptor is a function pointer that we must call
10411 // to obtain the address of the variable.
10412 SDValue Chain = DAG.getEntryNode();
10413 SDValue FuncTLVGet = DAG.getLoad(
10414 PtrMemVT, DL, Chain, DescAddr,
10416 Align(PtrMemVT.getSizeInBits() / 8),
10418 Chain = FuncTLVGet.getValue(1);
10419
10420 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10421 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10422
10423 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10424 MFI.setAdjustsStack(true);
10425
10426 // TLS calls preserve all registers except those that absolutely must be
10427 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10428 // silly).
10429 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10430 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10431 if (Subtarget->hasCustomCallingConv())
10432 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10433
10434 // Finally, we can make the call. This is just a degenerate version of a
10435 // normal AArch64 call node: x0 takes the address of the descriptor, and
10436 // returns the address of the variable in this thread.
10437 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10438
10439 unsigned Opcode = AArch64ISD::CALL;
10441 Ops.push_back(Chain);
10442 Ops.push_back(FuncTLVGet);
10443
10444 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10445 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10446 Opcode = AArch64ISD::AUTH_CALL;
10447 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10448 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10449 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10450 }
10451
10452 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10453 Ops.push_back(DAG.getRegisterMask(Mask));
10454 Ops.push_back(Chain.getValue(1));
10455 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10456 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10457}
10458
10459/// Convert a thread-local variable reference into a sequence of instructions to
10460/// compute the variable's address for the local exec TLS model of ELF targets.
10461/// The sequence depends on the maximum TLS area size.
10462SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10463 SDValue ThreadBase,
10464 const SDLoc &DL,
10465 SelectionDAG &DAG) const {
10466 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10467 SDValue TPOff, Addr;
10468
10469 switch (DAG.getTarget().Options.TLSSize) {
10470 default:
10471 llvm_unreachable("Unexpected TLS size");
10472
10473 case 12: {
10474 // mrs x0, TPIDR_EL0
10475 // add x0, x0, :tprel_lo12:a
10477 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10478 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10479 Var,
10480 DAG.getTargetConstant(0, DL, MVT::i32)),
10481 0);
10482 }
10483
10484 case 24: {
10485 // mrs x0, TPIDR_EL0
10486 // add x0, x0, :tprel_hi12:a
10487 // add x0, x0, :tprel_lo12_nc:a
10488 SDValue HiVar = DAG.getTargetGlobalAddress(
10489 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10490 SDValue LoVar = DAG.getTargetGlobalAddress(
10491 GV, DL, PtrVT, 0,
10493 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10494 HiVar,
10495 DAG.getTargetConstant(0, DL, MVT::i32)),
10496 0);
10497 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10498 LoVar,
10499 DAG.getTargetConstant(0, DL, MVT::i32)),
10500 0);
10501 }
10502
10503 case 32: {
10504 // mrs x1, TPIDR_EL0
10505 // movz x0, #:tprel_g1:a
10506 // movk x0, #:tprel_g0_nc:a
10507 // add x0, x1, x0
10508 SDValue HiVar = DAG.getTargetGlobalAddress(
10509 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10510 SDValue LoVar = DAG.getTargetGlobalAddress(
10511 GV, DL, PtrVT, 0,
10513 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10514 DAG.getTargetConstant(16, DL, MVT::i32)),
10515 0);
10516 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10517 DAG.getTargetConstant(0, DL, MVT::i32)),
10518 0);
10519 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10520 }
10521
10522 case 48: {
10523 // mrs x1, TPIDR_EL0
10524 // movz x0, #:tprel_g2:a
10525 // movk x0, #:tprel_g1_nc:a
10526 // movk x0, #:tprel_g0_nc:a
10527 // add x0, x1, x0
10528 SDValue HiVar = DAG.getTargetGlobalAddress(
10529 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10530 SDValue MiVar = DAG.getTargetGlobalAddress(
10531 GV, DL, PtrVT, 0,
10533 SDValue LoVar = DAG.getTargetGlobalAddress(
10534 GV, DL, PtrVT, 0,
10536 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10537 DAG.getTargetConstant(32, DL, MVT::i32)),
10538 0);
10539 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10540 DAG.getTargetConstant(16, DL, MVT::i32)),
10541 0);
10542 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10543 DAG.getTargetConstant(0, DL, MVT::i32)),
10544 0);
10545 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10546 }
10547 }
10548}
10549
10550/// When accessing thread-local variables under either the general-dynamic or
10551/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10552/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10553/// is a function pointer to carry out the resolution.
10554///
10555/// The sequence is:
10556/// adrp x0, :tlsdesc:var
10557/// ldr x1, [x0, #:tlsdesc_lo12:var]
10558/// add x0, x0, #:tlsdesc_lo12:var
10559/// .tlsdesccall var
10560/// blr x1
10561/// (TPIDR_EL0 offset now in x0)
10562///
10563/// The above sequence must be produced unscheduled, to enable the linker to
10564/// optimize/relax this sequence.
10565/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10566/// above sequence, and expanded really late in the compilation flow, to ensure
10567/// the sequence is produced as per above.
10568SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10569 const SDLoc &DL,
10570 SelectionDAG &DAG) const {
10571 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10572
10573 SDValue Chain = DAG.getEntryNode();
10574 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10575
10576 unsigned Opcode =
10577 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10578 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10579 : AArch64ISD::TLSDESC_CALLSEQ;
10580 Chain = DAG.getNode(Opcode, DL, NodeTys, {Chain, SymAddr});
10581 SDValue Glue = Chain.getValue(1);
10582
10583 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10584}
10585
10586SDValue
10587AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10588 SelectionDAG &DAG) const {
10589 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10590
10591 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10592 AArch64FunctionInfo *MFI =
10593 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10594
10598
10600 if (Model == TLSModel::LocalDynamic)
10602 }
10603
10605 Model != TLSModel::LocalExec)
10606 report_fatal_error("ELF TLS only supported in small memory model or "
10607 "in local exec TLS model");
10608 // Different choices can be made for the maximum size of the TLS area for a
10609 // module. For the small address model, the default TLS size is 16MiB and the
10610 // maximum TLS size is 4GiB.
10611 // FIXME: add tiny and large code model support for TLS access models other
10612 // than local exec. We currently generate the same code as small for tiny,
10613 // which may be larger than needed.
10614
10615 SDValue TPOff;
10616 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10617 SDLoc DL(Op);
10618 const GlobalValue *GV = GA->getGlobal();
10619
10620 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10621
10622 if (Model == TLSModel::LocalExec) {
10623 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10624 } else if (Model == TLSModel::InitialExec) {
10625 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10626 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10627 } else if (Model == TLSModel::LocalDynamic) {
10628 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10629 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10630 // the beginning of the module's TLS region, followed by a DTPREL offset
10631 // calculation.
10632
10633 // These accesses will need deduplicating if there's more than one.
10635
10636 // The call needs a relocation too for linker relaxation. It doesn't make
10637 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10638 // the address.
10639 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10641
10642 // Now we can calculate the offset from TPIDR_EL0 to this module's
10643 // thread-local area.
10644 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10645
10646 // Now use :dtprel_whatever: operations to calculate this variable's offset
10647 // in its thread-storage area.
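    // A sketch of what the two ADDXri nodes below become:
    //   add x0, x0, :dtprel_hi12:var
    //   add x0, x0, :dtprel_lo12_nc:var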
10648 SDValue HiVar = DAG.getTargetGlobalAddress(
10649 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10650 SDValue LoVar = DAG.getTargetGlobalAddress(
10651 GV, DL, MVT::i64, 0,
10653
10654 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10655 DAG.getTargetConstant(0, DL, MVT::i32)),
10656 0);
10657 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10658 DAG.getTargetConstant(0, DL, MVT::i32)),
10659 0);
10660 } else if (Model == TLSModel::GeneralDynamic) {
10661 // The call needs a relocation too for linker relaxation. It doesn't make
10662 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10663 // the address.
10664 SDValue SymAddr =
10665 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10666
10667 // Finally we can make a call to calculate the offset from tpidr_el0.
10668 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10669 } else
10670 llvm_unreachable("Unsupported ELF TLS access model");
10671
10672 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10673}
10674
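// Lower a Windows TLS access. A rough sketch of the sequence this builds
// (registers and relocation spellings are illustrative only):
//   ldr  x8, [x18, #0x58]      ; ThreadLocalStoragePointer from the TEB
//   adrp x9, _tls_index
//   ldr  w9, [x9, :lo12:_tls_index]
//   ldr  x8, [x8, x9, lsl #3]  ; base of this module's TLS data
// followed by adding the variable's hi12/lo12 offset within the .tls section.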
10675SDValue
10676AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10677 SelectionDAG &DAG) const {
10678 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10679
10680 SDValue Chain = DAG.getEntryNode();
10681 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10682 SDLoc DL(Op);
10683
10684 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10685
10686 // Load the ThreadLocalStoragePointer from the TEB
10687 // A pointer to the TLS array is located at offset 0x58 from the TEB.
10688 SDValue TLSArray =
10689 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10690 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10691 Chain = TLSArray.getValue(1);
10692
10693   // Load the TLS index from the C runtime.
10694 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10695 // This also does the same as LOADgot, but using a generic i32 load,
10696 // while LOADgot only loads i64.
10697 SDValue TLSIndexHi =
10698 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10699 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10700 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10701 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10702 SDValue TLSIndex =
10703 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10704 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10705 Chain = TLSIndex.getValue(1);
10706
10707   // The pointer to the thread's TLS data area is found by indexing the
10708   // TLSArray with the TLS index scaled by 8.
10709 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10710 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10711 DAG.getConstant(3, DL, PtrVT));
10712 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10713 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10714 MachinePointerInfo());
10715 Chain = TLS.getValue(1);
10716
10717 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10718 const GlobalValue *GV = GA->getGlobal();
10719 SDValue TGAHi = DAG.getTargetGlobalAddress(
10720 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10721 SDValue TGALo = DAG.getTargetGlobalAddress(
10722 GV, DL, PtrVT, 0,
10724
10725 // Add the offset from the start of the .tls section (section base).
10726 SDValue Addr =
10727 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10728 DAG.getTargetConstant(0, DL, MVT::i32)),
10729 0);
10730 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10731 return Addr;
10732}
10733
10734SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10735 SelectionDAG &DAG) const {
10736 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10737 if (DAG.getTarget().useEmulatedTLS())
10738 return LowerToTLSEmulatedModel(GA, DAG);
10739
10740 if (Subtarget->isTargetDarwin())
10741 return LowerDarwinGlobalTLSAddress(Op, DAG);
10742 if (Subtarget->isTargetELF())
10743 return LowerELFGlobalTLSAddress(Op, DAG);
10744 if (Subtarget->isTargetWindows())
10745 return LowerWindowsGlobalTLSAddress(Op, DAG);
10746
10747 llvm_unreachable("Unexpected platform trying to use TLS");
10748}
10749
10750//===----------------------------------------------------------------------===//
10751// PtrAuthGlobalAddress lowering
10752//
10753// We have 3 lowering alternatives to choose from:
10754// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10755// If the GV doesn't need a GOT load (i.e., is locally defined)
10756// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10757//
10758// - LOADgotPAC: similar to LOADgot, with added PAC.
10759// If the GV needs a GOT load, materialize the pointer using the usual
10760// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10761// section is assumed to be read-only (for example, via relro mechanism). See
10762// LowerMOVaddrPAC.
10763//
10764// - LOADauthptrstatic: similar to LOADgot, but use a
10765// special stub slot instead of a GOT slot.
10766// Load a signed pointer for symbol 'sym' from a stub slot named
10767// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10768// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10769// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10770//
10771 // All 3 are pseudos that are expanded late into longer sequences: this lets us
10772// provide integrity guarantees on the to-be-signed intermediate values.
10773//
10774// LOADauthptrstatic is undesirable because it requires a large section filled
10775// with often similarly-signed pointers, making it a good harvesting target.
10776// Thus, it's only used for ptrauth references to extern_weak to avoid null
10777// checks.
10778
10780 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10781 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10782 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10783 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10784
10785 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10786 // offset alone as a pointer if the symbol wasn't available, which would
10787 // probably break null checks in users. Ptrauth complicates things further:
10788 // error out.
10789 if (TGN->getOffset() != 0)
10791 "unsupported non-zero offset in weak ptrauth global reference");
10792
10793 if (!isNullConstant(AddrDiscriminator))
10794 report_fatal_error("unsupported weak addr-div ptrauth global");
10795
10796 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10797 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10798 {TGA, Key, Discriminator}),
10799 0);
10800}
10801
10802SDValue
10803AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10804 SelectionDAG &DAG) const {
10805 SDValue Ptr = Op.getOperand(0);
10806 uint64_t KeyC = Op.getConstantOperandVal(1);
10807 SDValue AddrDiscriminator = Op.getOperand(2);
10808 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10809 EVT VT = Op.getValueType();
10810 SDLoc DL(Op);
10811
10812 if (KeyC > AArch64PACKey::LAST)
10813 report_fatal_error("key in ptrauth global out of range [0, " +
10814 Twine((int)AArch64PACKey::LAST) + "]");
10815
10816 // Blend only works if the integer discriminator is 16-bit wide.
10817 if (!isUInt<16>(DiscriminatorC))
10819 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10820
10821 // Choosing between 3 lowering alternatives is target-specific.
10822 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10823 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10824
10825 int64_t PtrOffsetC = 0;
10826 if (Ptr.getOpcode() == ISD::ADD) {
10827 PtrOffsetC = Ptr.getConstantOperandVal(1);
10828 Ptr = Ptr.getOperand(0);
10829 }
10830 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10831 const GlobalValue *PtrGV = PtrN->getGlobal();
10832
10833 // Classify the reference to determine whether it needs a GOT load.
10834 const unsigned OpFlags =
10835 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10836 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10837 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10838 "unsupported non-GOT op flags on ptrauth global reference");
10839
10840 // Fold any offset into the GV; our pseudos expect it there.
10841 PtrOffsetC += PtrN->getOffset();
10842 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10843 /*TargetFlags=*/0);
10844 assert(PtrN->getTargetFlags() == 0 &&
10845 "unsupported target flags on ptrauth global");
10846
10847 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10848 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10849 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10850 ? AddrDiscriminator
10851 : DAG.getRegister(AArch64::XZR, MVT::i64);
10852
10853 // No GOT load needed -> MOVaddrPAC
10854 if (!NeedsGOTLoad) {
10855 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10856 return SDValue(
10857 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10858 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10859 0);
10860 }
10861
10862 // GOT load -> LOADgotPAC
10863 // Note that we disallow extern_weak refs to avoid null checks later.
10864 if (!PtrGV->hasExternalWeakLinkage())
10865 return SDValue(
10866 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10867 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10868 0);
10869
10870 // extern_weak ref -> LOADauthptrstatic
10872 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10873 DAG);
10874}
10875
10876// Looks through \param Val to determine the bit that can be used to
10877// check the sign of the value. It returns the unextended value and
10878// the sign bit position.
10879std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10880 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10881 return {Val.getOperand(0),
10882 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10883 1};
10884
10885 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10886 return {Val.getOperand(0),
10887 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10888
10889 return {Val, Val.getValueSizeInBits() - 1};
10890}
10891
10892SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10893 SDValue Chain = Op.getOperand(0);
10894 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10895 SDValue LHS = Op.getOperand(2);
10896 SDValue RHS = Op.getOperand(3);
10897 SDValue Dest = Op.getOperand(4);
10898 SDLoc DL(Op);
10899
10900 MachineFunction &MF = DAG.getMachineFunction();
10901 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10902 // will not be produced, as they are conditional branch instructions that do
10903 // not set flags.
10904 bool ProduceNonFlagSettingCondBr =
10905 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10906
10907 // Handle f128 first, since lowering it will result in comparing the return
10908 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10909 // is expecting to deal with.
10910 if (LHS.getValueType() == MVT::f128) {
10911 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
10912
10913 // If softenSetCCOperands returned a scalar, we need to compare the result
10914 // against zero to select between true and false values.
10915 if (!RHS.getNode()) {
10916 RHS = DAG.getConstant(0, DL, LHS.getValueType());
10917 CC = ISD::SETNE;
10918 }
10919 }
10920
10921 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10922 // instruction.
10924 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10925 // Only lower legal XALUO ops.
10926 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10927 return SDValue();
10928
10929 // The actual operation with overflow check.
10931 SDValue Value, Overflow;
10932 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10933
10934 if (CC == ISD::SETNE)
10935 OFCC = getInvertedCondCode(OFCC);
10936 SDValue CCVal = getCondCode(DAG, OFCC);
10937
10938 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
10939 Overflow);
10940 }
10941
10942 if (LHS.getValueType().isInteger()) {
10943 assert((LHS.getValueType() == RHS.getValueType()) &&
10944 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10945
10946 // If the RHS of the comparison is zero, we can potentially fold this
10947 // to a specialized branch.
10948 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10949 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10950 if (CC == ISD::SETEQ) {
10951 // See if we can use a TBZ to fold in an AND as well.
10952 // TBZ has a smaller branch displacement than CBZ. If the offset is
10953 // out of bounds, a late MI-layer pass rewrites branches.
10954 // 403.gcc is an example that hits this case.
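        // For example (a sketch): "br_cc eq (and x, 8), 0, dest" becomes
        //   tbz x, #3, dest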
10955 if (LHS.getOpcode() == ISD::AND &&
10956 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10957 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10958 SDValue Test = LHS.getOperand(0);
10959 uint64_t Mask = LHS.getConstantOperandVal(1);
10960 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, Test,
10961 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
10962 Dest);
10963 }
10964
10965 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
10966 } else if (CC == ISD::SETNE) {
10967 // See if we can use a TBZ to fold in an AND as well.
10968 // TBZ has a smaller branch displacement than CBZ. If the offset is
10969 // out of bounds, a late MI-layer pass rewrites branches.
10970 // 403.gcc is an example that hits this case.
10971 if (LHS.getOpcode() == ISD::AND &&
10972 isa<ConstantSDNode>(LHS.getOperand(1)) &&
10973 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
10974 SDValue Test = LHS.getOperand(0);
10975 uint64_t Mask = LHS.getConstantOperandVal(1);
10976 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, Test,
10977 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
10978 Dest);
10979 }
10980
10981 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
10982 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
10983 // Don't combine AND since emitComparison converts the AND to an ANDS
10984 // (a.k.a. TST) and the test in the test bit and branch instruction
10985 // becomes redundant. This would also increase register pressure.
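        // For example (a sketch): "br_cc lt x, 0, dest" on an i32 becomes
        //   tbnz w0, #31, dest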
10986 uint64_t SignBitPos;
10987 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10988 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
10989 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
10990 }
10991 }
10992 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
10993 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
10994 // Don't combine AND since emitComparison converts the AND to an ANDS
10995 // (a.k.a. TST) and the test in the test bit and branch instruction
10996 // becomes redundant. This would also increase register pressure.
10997 uint64_t SignBitPos;
10998 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
10999 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11000 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11001 }
11002
11003 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11004 // larger branch displacement but do prefer CB over cmp + br.
11005 if (Subtarget->hasCMPBR() &&
11007 ProduceNonFlagSettingCondBr) {
11008 SDValue Cond =
11010 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11011 Dest);
11012 }
11013
11014 SDValue CCVal;
11015 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11016 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11017 Cmp);
11018 }
11019
11020 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11021 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11022
11023 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11024 // clean. Some of them require two branches to implement.
11025 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11026 AArch64CC::CondCode CC1, CC2;
11027 changeFPCCToAArch64CC(CC, CC1, CC2);
11028 SDValue CC1Val = getCondCode(DAG, CC1);
11029 SDValue BR1 =
11030 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
11031 if (CC2 != AArch64CC::AL) {
11032 SDValue CC2Val = getCondCode(DAG, CC2);
11033 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11034 Cmp);
11035 }
11036
11037 return BR1;
11038}
11039
11040SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11041 SelectionDAG &DAG) const {
11042 if (!Subtarget->isNeonAvailable() &&
11043 !Subtarget->useSVEForFixedLengthVectors())
11044 return SDValue();
11045
11046 EVT VT = Op.getValueType();
11047 EVT IntVT = VT.changeTypeToInteger();
11048 SDLoc DL(Op);
11049
11050 SDValue In1 = Op.getOperand(0);
11051 SDValue In2 = Op.getOperand(1);
11052 EVT SrcVT = In2.getValueType();
11053
11054 if (!SrcVT.bitsEq(VT))
11055 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11056
11057 if (VT.isScalableVector())
11058 IntVT =
11060
11061 if (VT.isFixedLengthVector() &&
11062 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11063 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11064
11065 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11066 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11067
11068 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11069 return convertFromScalableVector(DAG, VT, Res);
11070 }
11071
11072 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11073   // an SVE FCOPYSIGN.
11074 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11075 Subtarget->isSVEorStreamingSVEAvailable()) {
11076 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11077 return SDValue();
11078 EVT SVT = getPackedSVEVectorVT(VT);
11079
11080 SDValue Ins1 =
11081 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
11082 DAG.getConstant(0, DL, MVT::i64));
11083 SDValue Ins2 =
11084 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
11085 DAG.getConstant(0, DL, MVT::i64));
11086 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11087 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
11088 DAG.getConstant(0, DL, MVT::i64));
11089 }
11090
11091 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11092 if (VT.isScalableVector())
11093 return getSVESafeBitCast(VT, Op, DAG);
11094
11095 return DAG.getBitcast(VT, Op);
11096 };
11097
11098 SDValue VecVal1, VecVal2;
11099 EVT VecVT;
11100 auto SetVecVal = [&](int Idx = -1) {
11101 if (!VT.isVector()) {
11102 VecVal1 =
11103 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
11104 VecVal2 =
11105 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
11106 } else {
11107 VecVal1 = BitCast(VecVT, In1, DAG);
11108 VecVal2 = BitCast(VecVT, In2, DAG);
11109 }
11110 };
11111 if (VT.isVector()) {
11112 VecVT = IntVT;
11113 SetVecVal();
11114 } else if (VT == MVT::f64) {
11115 VecVT = MVT::v2i64;
11116 SetVecVal(AArch64::dsub);
11117 } else if (VT == MVT::f32) {
11118 VecVT = MVT::v4i32;
11119 SetVecVal(AArch64::ssub);
11120 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11121 VecVT = MVT::v8i16;
11122 SetVecVal(AArch64::hsub);
11123 } else {
11124 llvm_unreachable("Invalid type for copysign!");
11125 }
11126
11127 unsigned BitWidth = In1.getScalarValueSizeInBits();
11128 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11129
11130 // We want to materialize a mask with every bit but the high bit set, but the
11131 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11132 // 64-bit elements. Instead, materialize all bits set and then negate that.
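  // As a sketch for f32 elements: the mask is 0x7fffffff per lane, so the BSP
  // below takes the exponent/mantissa bits from In1 and the sign bit from In2,
  // i.e. copysign(In1, In2).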
11133 if (VT == MVT::f64 || VT == MVT::v2f64) {
11134 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11135 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11136 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11137 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11138 }
11139
11140 SDValue BSP =
11141 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
11142 if (VT == MVT::f16 || VT == MVT::bf16)
11143 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11144 if (VT == MVT::f32)
11145 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11146 if (VT == MVT::f64)
11147 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11148
11149 return BitCast(VT, BSP, DAG);
11150}
11151
11152SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11153 SelectionDAG &DAG) const {
11154 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11155 Attribute::NoImplicitFloat))
11156 return SDValue();
11157
11158 EVT VT = Op.getValueType();
11159 if (VT.isScalableVector() ||
11160 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11161 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11162
11163 bool IsParity = Op.getOpcode() == ISD::PARITY;
11164 SDValue Val = Op.getOperand(0);
11165 SDLoc DL(Op);
11166
11167 // For i32, a general parity function using EORs is more efficient than
11168 // using floating point.
11169 if (VT == MVT::i32 && IsParity)
11170 return SDValue();
11171
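// With SVE (or streaming SVE) a scalar popcount can be done by moving the value
// into lane 0 of a scalable vector, using the predicated CNT, and moving the
// result back. A rough sketch of the expected code for i64 (the exact sequence
// depends on instruction selection):
//   fmov d0, x0
//   ptrue p0.d
//   cnt z0.d, p0/m, z0.d
//   fmov x0, d0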
11172 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11173 if (VT == MVT::i32 || VT == MVT::i64) {
11174 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11175 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11176 DAG.getUNDEF(ContainerVT), Val,
11177 DAG.getVectorIdxConstant(0, DL));
11178 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11179 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11180 DAG.getVectorIdxConstant(0, DL));
11181 if (IsParity)
11182 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11183 return Val;
11184 }
11185
11186 if (VT == MVT::i128) {
11187 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11188 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11189 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11190 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11191 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11192 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11193 if (IsParity)
11194 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11195 return Val;
11196 }
11197 }
11198
11199 if (!Subtarget->isNeonAvailable())
11200 return SDValue();
11201
11202 // If there is no scalar CNT instruction available, GPR popcount can
11203 // be more efficiently lowered to the following sequence that uses
11204 // AdvSIMD registers/instructions as long as the copies to/from
11205 // the AdvSIMD registers are cheap.
11206 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11207 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11208 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11209 // FMOV X0, D0 // copy result back to integer reg
11210 if (VT == MVT::i32 || VT == MVT::i64) {
11211 if (VT == MVT::i32)
11212 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11213 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11214
11215 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11216 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11217 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11218 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11219 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11220 DAG.getConstant(0, DL, MVT::i64));
11221 if (IsParity)
11222 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11223 return AddV;
11224 } else if (VT == MVT::i128) {
11225 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11226
11227 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11228 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11229 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11230 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11231 DAG.getConstant(0, DL, MVT::i64));
11232 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11233 if (IsParity)
11234 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11235 return AddV;
11236 }
11237
11238 assert(!IsParity && "ISD::PARITY of vector types not supported");
11239
11240 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11241 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11242 "Unexpected type for custom ctpop lowering");
11243
11244 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11245 Val = DAG.getBitcast(VT8Bit, Val);
11246 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11247
11248 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11249 VT.getVectorNumElements() >= 2) {
11250 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11251 SDValue Zeros = DAG.getConstant(0, DL, DT);
11252 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11253
11254 if (VT == MVT::v2i64) {
11255 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11256 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11257 } else if (VT == MVT::v2i32) {
11258 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11259 } else if (VT == MVT::v4i32) {
11260 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11261 } else {
11262 llvm_unreachable("Unexpected type for custom ctpop lowering");
11263 }
11264
11265 return Val;
11266 }
11267
11268 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
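// For example, for v4i32 the v16i8 CTPOP result is widened by UADDLP to v8i16
// and then again to v4i32; each step halves the element count and doubles the
// element width by pairwise-adding adjacent elements.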
11269 unsigned EltSize = 8;
11270 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11271 while (EltSize != VT.getScalarSizeInBits()) {
11272 EltSize *= 2;
11273 NumElts /= 2;
11274 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11275 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11276 }
11277
11278 return Val;
11279}
11280
11281SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11282 EVT VT = Op.getValueType();
11283 assert(VT.isScalableVector() ||
11284 useSVEForFixedLengthVectorVT(
11285 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11286
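// CTTZ is implemented as CTLZ of the bit-reversed value. For example, for
// x = 0b...10100 (two trailing zeros), BITREVERSE moves those zeros to the
// most significant end, so the following CTLZ returns 2, as required.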
11287 SDLoc DL(Op);
11288 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11289 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11290}
11291
11292SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11293 SelectionDAG &DAG) const {
11294
11295 EVT VT = Op.getValueType();
11296 SDLoc DL(Op);
11297 unsigned Opcode = Op.getOpcode();
11298 ISD::CondCode CC;
11299 switch (Opcode) {
11300 default:
11301 llvm_unreachable("Wrong instruction");
11302 case ISD::SMAX:
11303 CC = ISD::SETGT;
11304 break;
11305 case ISD::SMIN:
11306 CC = ISD::SETLT;
11307 break;
11308 case ISD::UMAX:
11309 CC = ISD::SETUGT;
11310 break;
11311 case ISD::UMIN:
11312 CC = ISD::SETULT;
11313 break;
11314 }
11315
11316 if (VT.isScalableVector() ||
11317 useSVEForFixedLengthVectorVT(
11318 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
11319 switch (Opcode) {
11320 default:
11321 llvm_unreachable("Wrong instruction");
11322 case ISD::SMAX:
11323 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11324 case ISD::SMIN:
11325 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11326 case ISD::UMAX:
11327 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11328 case ISD::UMIN:
11329 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11330 }
11331 }
11332
11333 SDValue Op0 = Op.getOperand(0);
11334 SDValue Op1 = Op.getOperand(1);
11335 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11336 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11337}
11338
11339SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11340 SelectionDAG &DAG) const {
11341 EVT VT = Op.getValueType();
11342
11343 if (VT.isScalableVector() ||
11344 useSVEForFixedLengthVectorVT(
11345 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11346 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11347
11348 SDLoc DL(Op);
11349 SDValue REVB;
11350 MVT VST;
11351
11352 switch (VT.getSimpleVT().SimpleTy) {
11353 default:
11354 llvm_unreachable("Invalid type for bitreverse!");
11355
11356 case MVT::v2i32: {
11357 VST = MVT::v8i8;
11358 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11359
11360 break;
11361 }
11362
11363 case MVT::v4i32: {
11364 VST = MVT::v16i8;
11365 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11366
11367 break;
11368 }
11369
11370 case MVT::v1i64: {
11371 VST = MVT::v8i8;
11372 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11373
11374 break;
11375 }
11376
11377 case MVT::v2i64: {
11378 VST = MVT::v16i8;
11379 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11380
11381 break;
11382 }
11383 }
11384
11385 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11386 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11387}
11388
11389 // Check whether N is part of a continuous comparison sequence (ORs over XORs).
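// For example, (or (xor a0, a1), (or (xor b0, b1), (xor c0, c1))) is accepted
// and fills WorkList with the pairs (a0, a1), (b0, b1) and (c0, c1); one-use
// zero-extends of the XOR leaves are looked through.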
11390static bool
11391isOrXorChain(SDValue N, unsigned &Num,
11392 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11393 if (Num == MaxXors)
11394 return false;
11395
11396 // Skip the one-use zext
11397 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11398 N = N->getOperand(0);
11399
11400 // The leaf node must be XOR
11401 if (N->getOpcode() == ISD::XOR) {
11402 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11403 Num++;
11404 return true;
11405 }
11406
11407 // All the non-leaf nodes must be OR.
11408 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11409 return false;
11410
11411 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11412 isOrXorChain(N->getOperand(1), Num, WorkList))
11413 return true;
11414 return false;
11415}
11416
11417 // Transform chains of ORs and XORs, which usually come from outlined memcmp/bcmp.
11418 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
11419 SDValue LHS = N->getOperand(0);
11420 SDValue RHS = N->getOperand(1);
11421 SDLoc DL(N);
11422 EVT VT = N->getValueType(0);
11423 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
11424
11425 // Only handle integer compares.
11426 if (N->getOpcode() != ISD::SETCC)
11427 return SDValue();
11428
11429 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11430 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11431 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11432 unsigned NumXors = 0;
11433 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11434 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11435 isOrXorChain(LHS, NumXors, WorkList)) {
11436 SDValue XOR0, XOR1;
11437 std::tie(XOR0, XOR1) = WorkList[0];
11438 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11439 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11440 for (unsigned I = 1; I < WorkList.size(); I++) {
11441 std::tie(XOR0, XOR1) = WorkList[I];
11442 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11443 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11444 }
11445
11446 // Exit early by inverting the condition, which helps reduce indentation.
11447 return Cmp;
11448 }
11449
11450 return SDValue();
11451}
11452
11453SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11454
11455 if (Op.getValueType().isVector())
11456 return LowerVSETCC(Op, DAG);
11457
11458 bool IsStrict = Op->isStrictFPOpcode();
11459 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11460 unsigned OpNo = IsStrict ? 1 : 0;
11461 SDValue Chain;
11462 if (IsStrict)
11463 Chain = Op.getOperand(0);
11464 SDValue LHS = Op.getOperand(OpNo + 0);
11465 SDValue RHS = Op.getOperand(OpNo + 1);
11466 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11467 SDLoc DL(Op);
11468
11469 // We chose ZeroOrOneBooleanContents, so use zero and one.
11470 EVT VT = Op.getValueType();
11471 SDValue TVal = DAG.getConstant(1, DL, VT);
11472 SDValue FVal = DAG.getConstant(0, DL, VT);
11473
11474 // Handle f128 first, since one possible outcome is a normal integer
11475 // comparison which gets picked up by the next if statement.
11476 if (LHS.getValueType() == MVT::f128) {
11477 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11478 IsSignaling);
11479
11480 // If softenSetCCOperands returned a scalar, use it.
11481 if (!RHS.getNode()) {
11482 assert(LHS.getValueType() == Op.getValueType() &&
11483 "Unexpected setcc expansion!");
11484 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11485 }
11486 }
11487
11488 if (LHS.getValueType().isInteger()) {
11489
11490 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11491
11492 SDValue CCVal;
11493 SDValue Cmp = getAArch64Cmp(
11494 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11495
11496 // Note that we inverted the condition above, so we reverse the order of
11497 // the true and false operands here. This will allow the setcc to be
11498 // matched to a single CSINC instruction.
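// For example, an i32 equality setcc is expected to become "cset w0, eq",
// which is an alias of "csinc w0, wzr, wzr, ne"; hence the inverted condition
// and swapped true/false values.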
11499 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11500 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11501 }
11502
11503 // Now we know we're dealing with FP values.
11504 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11505 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11506
11507 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11508 // and do the comparison.
11509 SDValue Cmp;
11510 if (IsStrict)
11511 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11512 else
11513 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11514
11515 AArch64CC::CondCode CC1, CC2;
11516 changeFPCCToAArch64CC(CC, CC1, CC2);
11517 SDValue Res;
11518 if (CC2 == AArch64CC::AL) {
11519 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11520 CC2);
11521 SDValue CC1Val = getCondCode(DAG, CC1);
11522
11523 // Note that we inverted the condition above, so we reverse the order of
11524 // the true and false operands here. This will allow the setcc to be
11525 // matched to a single CSINC instruction.
11526 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
11527 } else {
11528 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11529 // totally clean. Some of them require two CSELs to implement. As is in
11530 // this case, we emit the first CSEL and then emit a second using the output
11531 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11532
11533 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11534 SDValue CC1Val = getCondCode(DAG, CC1);
11535 SDValue CS1 =
11536 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11537
11538 SDValue CC2Val = getCondCode(DAG, CC2);
11539 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11540 }
11541 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
11542}
11543
11544SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11545 SelectionDAG &DAG) const {
11546
11547 SDValue LHS = Op.getOperand(0);
11548 SDValue RHS = Op.getOperand(1);
11549 EVT VT = LHS.getValueType();
11550 if (VT != MVT::i32 && VT != MVT::i64)
11551 return SDValue();
11552
11553 SDLoc DL(Op);
11554 SDValue Carry = Op.getOperand(2);
11555 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11556 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11557 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
11558 LHS, RHS, InvCarry);
11559
11560 EVT OpVT = Op.getValueType();
11561 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11562 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11563
11564 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11565 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11566 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
11567 // Inputs are swapped because the condition is inverted. This will allow
11568 // matching with a single CSINC instruction.
11569 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11570 Cmp.getValue(1));
11571}
11572
11573/// Emit vector comparison for floating-point values, producing a mask.
11574 static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
11575 AArch64CC::CondCode CC, bool NoNans, EVT VT,
11576 const SDLoc &DL, SelectionDAG &DAG) {
11577 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
11578 "function only supposed to emit natural comparisons");
11579
11580 switch (CC) {
11581 default:
11582 return SDValue();
11583 case AArch64CC::NE: {
11584 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11585 // Use vector semantics for the inversion to potentially save a copy between
11586 // SIMD and regular registers.
11587 if (!LHS.getValueType().isVector()) {
11588 EVT VecVT =
11589 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11590 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11591 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
11592 DAG.getUNDEF(VecVT), Fcmeq, Zero);
11593 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
11594 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
11595 }
11596 return DAG.getNOT(DL, Fcmeq, VT);
11597 }
11598 case AArch64CC::EQ:
11599 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11600 case AArch64CC::GE:
11601 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
11602 case AArch64CC::GT:
11603 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
11604 case AArch64CC::LE:
11605 if (!NoNans)
11606 return SDValue();
11607 // If we ignore NaNs then we can use the LS implementation.
11608 [[fallthrough]];
11609 case AArch64CC::LS:
11610 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
11611 case AArch64CC::LT:
11612 if (!NoNans)
11613 return SDValue();
11614 // If we ignore NaNs then we can use the MI implementation.
11615 [[fallthrough]];
11616 case AArch64CC::MI:
11617 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
11618 }
11619}
11620
11621/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
11622/// values are scalars, try to emit a mask generating vector instruction.
11623 static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
11624 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
11625 const SDLoc &DL, SelectionDAG &DAG) {
11626 assert(!LHS.getValueType().isVector());
11627 assert(!RHS.getValueType().isVector());
11628
11629 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
11630 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
11631 if (!CTVal || !CFVal)
11632 return {};
11633 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
11634 !(CTVal->isZero() && CFVal->isAllOnes()))
11635 return {};
11636
11637 if (CTVal->isZero())
11638 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11639
11640 EVT VT = TVal.getValueType();
11641 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
11642 return {};
11643
11644 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
11645 bool OneNaN = false;
11646 if (LHS == RHS) {
11647 OneNaN = true;
11648 } else if (DAG.isKnownNeverNaN(RHS)) {
11649 OneNaN = true;
11650 RHS = LHS;
11651 } else if (DAG.isKnownNeverNaN(LHS)) {
11652 OneNaN = true;
11653 LHS = RHS;
11654 }
11655 if (OneNaN)
11656 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
11657 }
11658
11661 bool ShouldInvert = false;
11662 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
11663 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
11664 SDValue Cmp2;
11665 if (CC2 != AArch64CC::AL) {
11666 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
11667 if (!Cmp2)
11668 return {};
11669 }
11670 if (!Cmp2 && !ShouldInvert)
11671 return Cmp;
11672
11673 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11674 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11675 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
11676 Zero);
11677 if (Cmp2) {
11678 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
11679 Cmp2, Zero);
11680 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
11681 }
11682 if (ShouldInvert)
11683 Cmp = DAG.getNOT(DL, Cmp, VecVT);
11684 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
11685 return Cmp;
11686}
11687
11688SDValue AArch64TargetLowering::LowerSELECT_CC(
11691 const SDLoc &DL, SelectionDAG &DAG) const {
11692 // Handle f128 first, because it will result in a comparison of some RTLIB
11693 // call result against zero.
11694 if (LHS.getValueType() == MVT::f128) {
11695 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11696
11697 // If softenSetCCOperands returned a scalar, we need to compare the result
11698 // against zero to select between true and false values.
11699 if (!RHS.getNode()) {
11700 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11701 CC = ISD::SETNE;
11702 }
11703 }
11704
11705 // Also handle f16, for which we need to do a f32 comparison.
11706 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11707 LHS.getValueType() == MVT::bf16) {
11708 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
11709 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
11710 }
11711
11712 // Next, handle integers.
11713 if (LHS.getValueType().isInteger()) {
11714 assert((LHS.getValueType() == RHS.getValueType()) &&
11715 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11716
11717 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11718 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11719 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11720
11721 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11722 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11723 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11724 // Both require fewer instructions than compare and conditional select.
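// Worked example for i32 smax(lhs, 0): (SRA lhs, 31) is 0 when lhs >= 0 and
// all-ones when lhs < 0, so ANDing lhs with the inverted shift (BIC) keeps lhs
// for non-negative values and yields 0 otherwise; the setlt case uses the
// non-inverted mask to implement smin(lhs, 0).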
11725 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11726 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11727 LHS.getValueType() == RHS.getValueType()) {
11728 EVT VT = LHS.getValueType();
11729 SDValue Shift =
11730 DAG.getNode(ISD::SRA, DL, VT, LHS,
11731 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
11732
11733 if (CC == ISD::SETGT)
11734 Shift = DAG.getNOT(DL, Shift, VT);
11735
11736 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
11737 }
11738
11739 // Canonicalise absolute difference patterns:
11740 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
11741 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
11742 //
11743 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
11744 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
11745 // The second forms can be matched into subs+cneg.
11746 // NOTE: Drop poison generating flags from the negated operand to avoid
11747 // inadvertently propagating poison after the canonicalisation.
11748 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
11749 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
11750 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
11752 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
11753 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
11754 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
11756 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
11757 }
11758 }
11759
11760 unsigned Opcode = AArch64ISD::CSEL;
11761
11762 // If both the TVal and the FVal are constants, see if we can swap them in
11763 // order to form a CSINV or CSINC out of them.
11764 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11765 std::swap(TVal, FVal);
11766 std::swap(CTVal, CFVal);
11767 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11768 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11769 std::swap(TVal, FVal);
11770 std::swap(CTVal, CFVal);
11771 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11772 } else if (TVal.getOpcode() == ISD::XOR) {
11773 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11774 // with a CSINV rather than a CSEL.
11775 if (isAllOnesConstant(TVal.getOperand(1))) {
11776 std::swap(TVal, FVal);
11777 std::swap(CTVal, CFVal);
11778 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11779 }
11780 } else if (TVal.getOpcode() == ISD::SUB) {
11781 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11782 // that we can match with a CSNEG rather than a CSEL.
11783 if (isNullConstant(TVal.getOperand(0))) {
11784 std::swap(TVal, FVal);
11785 std::swap(CTVal, CFVal);
11786 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11787 }
11788 } else if (CTVal && CFVal) {
11789 const int64_t TrueVal = CTVal->getSExtValue();
11790 const int64_t FalseVal = CFVal->getSExtValue();
11791 bool Swap = false;
11792
11793 // If both TVal and FVal are constants, see if FVal is the
11794 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11795 // instead of a CSEL in that case.
11796 if (TrueVal == ~FalseVal) {
11797 Opcode = AArch64ISD::CSINV;
11798 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11799 TrueVal == -FalseVal) {
11800 Opcode = AArch64ISD::CSNEG;
11801 } else if (TVal.getValueType() == MVT::i32) {
11802 // If our operands are only 32-bit wide, make sure we use 32-bit
11803 // arithmetic for the check whether we can use CSINC. This ensures that
11804 // the addition in the check will wrap around properly in case there is
11805 // an overflow (which would not be the case if we do the check with
11806 // 64-bit arithmetic).
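// For example, with TrueVal32 = 0 and FalseVal32 = 0xffffffff, the 32-bit
// check sees FalseVal32 + 1 wrap to 0 == TrueVal32 and selects CSINC, whereas
// the same values checked in 64-bit arithmetic (0xffffffff + 1 == 0x100000000)
// would not match.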
11807 const uint32_t TrueVal32 = CTVal->getZExtValue();
11808 const uint32_t FalseVal32 = CFVal->getZExtValue();
11809
11810 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11811 Opcode = AArch64ISD::CSINC;
11812
11813 if (TrueVal32 > FalseVal32) {
11814 Swap = true;
11815 }
11816 }
11817 } else {
11818 // 64-bit check whether we can use CSINC.
11819 const uint64_t TrueVal64 = TrueVal;
11820 const uint64_t FalseVal64 = FalseVal;
11821
11822 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11823 Opcode = AArch64ISD::CSINC;
11824
11825 if (TrueVal > FalseVal) {
11826 Swap = true;
11827 }
11828 }
11829 }
11830
11831 // Swap TVal and FVal if necessary.
11832 if (Swap) {
11833 std::swap(TVal, FVal);
11834 std::swap(CTVal, CFVal);
11835 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11836 }
11837
11838 if (Opcode != AArch64ISD::CSEL) {
11839 // Drop FVal since we can get its value by simply inverting/negating
11840 // TVal.
11841 FVal = TVal;
11842 }
11843 }
11844
11845 // Avoid materializing a constant when possible by reusing a known value in
11846 // a register. However, don't perform this optimization if the known value
11847 // is one, zero or negative one in the case of a CSEL. We can always
11848 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11849 // FVal, respectively.
11850 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11851 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11852 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11854 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11855 // "a != C ? x : a" to avoid materializing C.
11856 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11857 TVal = LHS;
11858 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11859 FVal = LHS;
11860 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11861 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11862 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11863 // avoid materializing C.
11865 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11866 Opcode = AArch64ISD::CSINV;
11867 TVal = LHS;
11868 FVal = DAG.getConstant(0, DL, FVal.getValueType());
11869 }
11870 }
11871
11872 SDValue CCVal;
11873 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11874 EVT VT = TVal.getValueType();
11875 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
11876 }
11877
11878 // Now we know we're dealing with FP values.
11879 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11880 LHS.getValueType() == MVT::f64);
11881 assert(LHS.getValueType() == RHS.getValueType());
11882 EVT VT = TVal.getValueType();
11883
11884 // If the purpose of the comparison is to select between all ones
11885 // or all zeros, try to use a vector comparison because the operands are
11886 // already stored in SIMD registers.
11887 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
11888 switch (U->getOpcode()) {
11889 default:
11890 return false;
11893 case AArch64ISD::DUP:
11894 return true;
11895 }
11896 })) {
11897 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
11898 SDValue VectorCmp =
11899 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
11900 if (VectorCmp)
11901 return VectorCmp;
11902 }
11903
11904 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11905
11906 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11907 // clean. Some of them require two CSELs to implement.
11908 AArch64CC::CondCode CC1, CC2;
11909 changeFPCCToAArch64CC(CC, CC1, CC2);
11910
11911 if (Flags.hasNoSignedZeros()) {
11912 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11913 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11914 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
11915 if (RHSVal && RHSVal->isZero()) {
11916 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
11917 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
11918
11919 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11920 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11921 TVal = LHS;
11922 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11923 CFVal && CFVal->isZero() &&
11924 FVal.getValueType() == LHS.getValueType())
11925 FVal = LHS;
11926 }
11927 }
11928
11929 // Emit first, and possibly only, CSEL.
11930 SDValue CC1Val = getCondCode(DAG, CC1);
11931 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11932
11933 // If we need a second CSEL, emit it, using the output of the first as the
11934 // RHS. We're effectively OR'ing the two CC's together.
11935 if (CC2 != AArch64CC::AL) {
11936 SDValue CC2Val = getCondCode(DAG, CC2);
11937 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11938 }
11939
11940 // Otherwise, return the output of the first CSEL.
11941 return CS1;
11942}
11943
11944SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
11945 SelectionDAG &DAG) const {
11946 EVT Ty = Op.getValueType();
11947 auto Idx = Op.getConstantOperandAPInt(2);
11948 int64_t IdxVal = Idx.getSExtValue();
11949 assert(Ty.isScalableVector() &&
11950 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
11951
11952 // We can use the splice instruction for certain index values where we are
11953 // able to efficiently generate the correct predicate. The index will be
11954 // inverted and used directly as the input to the ptrue instruction, i.e.
11955 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
11956 // splice predicate. However, we can only do this if we can guarantee that
11957 // there are enough elements in the vector, hence we check the index <= min
11958 // number of elements.
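// A rough sketch for nxv4i32 with index -2 (the exact code depends on
// selection): "ptrue p0.s, vl2; rev p0.s, p0.s; splice z0.s, p0, z0.s, z1.s",
// where the reversed VL2 predicate marks the last two elements of the first
// operand as active.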
11959 std::optional<unsigned> PredPattern;
11960 if (Ty.isScalableVector() && IdxVal < 0 &&
11961 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
11962 std::nullopt) {
11963 SDLoc DL(Op);
11964
11965 // Create a predicate where all but the last -IdxVal elements are false.
11966 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
11967 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
11968 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
11969
11970 // Now splice the two inputs together using the predicate.
11971 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
11972 Op.getOperand(1));
11973 }
11974
11975 // We can select to an EXT instruction when indexing the first 256 bytes.
11977 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
11978 return Op;
11979
11980 return SDValue();
11981}
11982
11983SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
11984 SelectionDAG &DAG) const {
11985 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
11986 SDValue LHS = Op.getOperand(0);
11987 SDValue RHS = Op.getOperand(1);
11988 SDValue TVal = Op.getOperand(2);
11989 SDValue FVal = Op.getOperand(3);
11990 SDNodeFlags Flags = Op->getFlags();
11991 SDLoc DL(Op);
11992 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
11993}
11994
11995SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
11996 SelectionDAG &DAG) const {
11997 SDValue CCVal = Op->getOperand(0);
11998 SDValue TVal = Op->getOperand(1);
11999 SDValue FVal = Op->getOperand(2);
12000 SDLoc DL(Op);
12001
12002 EVT Ty = Op.getValueType();
12003 if (Ty == MVT::aarch64svcount) {
12004 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
12005 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
12006 SDValue Sel =
12007 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
12008 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
12009 }
12010
12011 if (Ty.isScalableVector()) {
12012 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
12013 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
12014 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12015 }
12016
12017 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
12018 // FIXME: Ideally this would be the same as above using i1 types, however
12019 // for the moment we can't deal with fixed i1 vector types properly, so
12020 // instead extend the predicate to a result type sized integer vector.
12021 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
12022 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
12023 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
12024 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
12025 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12026 }
12027
12028 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
12029 // instruction.
12030 if (ISD::isOverflowIntrOpRes(CCVal)) {
12031 // Only lower legal XALUO ops.
12032 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
12033 return SDValue();
12034
12035 AArch64CC::CondCode OFCC;
12036 SDValue Value, Overflow;
12037 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
12038 SDValue CCVal = getCondCode(DAG, OFCC);
12039
12040 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
12041 CCVal, Overflow);
12042 }
12043
12044 // Lower it the same way as we would lower a SELECT_CC node.
12045 ISD::CondCode CC;
12046 SDValue LHS, RHS;
12047 if (CCVal.getOpcode() == ISD::SETCC) {
12048 LHS = CCVal.getOperand(0);
12049 RHS = CCVal.getOperand(1);
12050 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
12051 } else {
12052 LHS = CCVal;
12053 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
12054 CC = ISD::SETNE;
12055 }
12056
12057 // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
12058 // order to use FCSELSrrr
12059 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12060 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12061 DAG.getUNDEF(MVT::f32), TVal);
12062 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12063 DAG.getUNDEF(MVT::f32), FVal);
12064 }
12065
12066 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
12067 Op->getFlags(), DL, DAG);
12068
12069 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12070 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
12071 }
12072
12073 return Res;
12074}
12075
12076SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
12077 SelectionDAG &DAG) const {
12078 // Jump table entries are encoded as PC-relative offsets. No additional
12079 // tweaking is necessary here. Just get the address of the jump table.
12080 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12081
12082 CodeModel::Model CM = getTargetMachine().getCodeModel();
12083 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
12084 !Subtarget->isTargetMachO())
12085 return getAddrLarge(JT, DAG);
12086 if (CM == CodeModel::Tiny)
12087 return getAddrTiny(JT, DAG);
12088 return getAddr(JT, DAG);
12089}
12090
12091SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
12092 SelectionDAG &DAG) const {
12093 // Jump table entries are encoded as PC-relative offsets. No additional
12094 // tweaking is necessary here. Just get the address of the jump table.
12095 SDLoc DL(Op);
12096 SDValue JT = Op.getOperand(1);
12097 SDValue Entry = Op.getOperand(2);
12098 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
12099
12100 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12101 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
12102
12103 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
12104 // sequence later, to guarantee the integrity of the intermediate values.
12106 "aarch64-jump-table-hardening")) {
12107 CodeModel::Model CM = getTargetMachine().getCodeModel();
12108 if (Subtarget->isTargetMachO()) {
12109 if (CM != CodeModel::Small && CM != CodeModel::Large)
12110 report_fatal_error("Unsupported code-model for hardened jump-table");
12111 } else {
12112 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
12113 assert(Subtarget->isTargetELF() &&
12114 "jump table hardening only supported on MachO/ELF");
12115 if (CM != CodeModel::Small)
12116 report_fatal_error("Unsupported code-model for hardened jump-table");
12117 }
12118
12119 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
12120 Entry, SDValue());
12121 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
12122 DAG.getTargetJumpTable(JTI, MVT::i32),
12123 X16Copy.getValue(0), X16Copy.getValue(1));
12124 return SDValue(B, 0);
12125 }
12126
12127 SDNode *Dest =
12128 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
12129 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
12130 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
12131 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
12132}
12133
12134SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
12135 SDValue Chain = Op.getOperand(0);
12136 SDValue Dest = Op.getOperand(1);
12137
12138 // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
12139 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
12140 if (Dest->isMachineOpcode() &&
12141 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
12142 return SDValue();
12143
12144 const MachineFunction &MF = DAG.getMachineFunction();
12145 std::optional<uint16_t> BADisc =
12146 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
12147 if (!BADisc)
12148 return SDValue();
12149
12150 SDLoc DL(Op);
12151
12152 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12154 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12155
12156 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
12157 {Dest, Key, Disc, AddrDisc, Chain});
12158 return SDValue(BrA, 0);
12159}
12160
12161SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
12162 SelectionDAG &DAG) const {
12163 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12164 CodeModel::Model CM = getTargetMachine().getCodeModel();
12165 if (CM == CodeModel::Large) {
12166 // Use the GOT for the large code model on iOS.
12167 if (Subtarget->isTargetMachO()) {
12168 return getGOT(CP, DAG);
12169 }
12171 return getAddrLarge(CP, DAG);
12172 } else if (CM == CodeModel::Tiny) {
12173 return getAddrTiny(CP, DAG);
12174 }
12175 return getAddr(CP, DAG);
12176}
12177
12178SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
12179 SelectionDAG &DAG) const {
12180 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
12181 const BlockAddress *BA = BAN->getBlockAddress();
12182
12183 if (std::optional<uint16_t> BADisc =
12184 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
12185 *BA->getFunction())) {
12186 SDLoc DL(Op);
12187
12188 // This isn't cheap, but BRIND is rare.
12189 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
12190
12191 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12192
12194 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12195
12196 SDNode *MOV =
12197 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
12198 {TargetBA, Key, AddrDisc, Disc});
12199 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
12200 SDValue(MOV, 1));
12201 }
12202
12203 CodeModel::Model CM = getTargetMachine().getCodeModel();
12204 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
12206 return getAddrLarge(BAN, DAG);
12207 } else if (CM == CodeModel::Tiny) {
12208 return getAddrTiny(BAN, DAG);
12209 }
12210 return getAddr(BAN, DAG);
12211}
12212
12213SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
12214 SelectionDAG &DAG) const {
12215 AArch64FunctionInfo *FuncInfo =
12216 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12217
12218 SDLoc DL(Op);
12219 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
12220 getPointerTy(DAG.getDataLayout()));
12221 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
12222 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12223 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12224 MachinePointerInfo(SV));
12225}
12226
12227SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
12228 SelectionDAG &DAG) const {
12229 MachineFunction &MF = DAG.getMachineFunction();
12230 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12231
12232 SDLoc DL(Op);
12233 SDValue FR;
12234 if (Subtarget->isWindowsArm64EC()) {
12235 // With the Arm64EC ABI, we compute the address of the varargs save area
12236 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
12237 // but calls from an entry thunk can pass in a different address.
12238 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
12239 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
12240 uint64_t StackOffset;
12241 if (FuncInfo->getVarArgsGPRSize() > 0)
12242 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
12243 else
12244 StackOffset = FuncInfo->getVarArgsStackOffset();
12245 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
12246 DAG.getConstant(StackOffset, DL, MVT::i64));
12247 } else {
12248 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
12249 ? FuncInfo->getVarArgsGPRIndex()
12250 : FuncInfo->getVarArgsStackIndex(),
12251 getPointerTy(DAG.getDataLayout()));
12252 }
12253 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12254 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12255 MachinePointerInfo(SV));
12256}
12257
12258SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
12259 SelectionDAG &DAG) const {
12260 // The layout of the va_list struct is specified in the AArch64 Procedure Call
12261 // Standard, section B.3.
12262 MachineFunction &MF = DAG.getMachineFunction();
12263 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12264 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12265 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12266 auto PtrVT = getPointerTy(DAG.getDataLayout());
12267 SDLoc DL(Op);
12268
12269 SDValue Chain = Op.getOperand(0);
12270 SDValue VAList = Op.getOperand(1);
12271 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12272 SmallVector<SDValue, 4> MemOps;
12273
12274 // void *__stack at offset 0
12275 unsigned Offset = 0;
12276 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12277 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12278 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12279 MachinePointerInfo(SV), Align(PtrSize)));
12280
12281 // void *__gr_top at offset 8 (4 on ILP32)
12282 Offset += PtrSize;
12283 int GPRSize = FuncInfo->getVarArgsGPRSize();
12284 if (GPRSize > 0) {
12285 SDValue GRTop, GRTopAddr;
12286
12287 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12288 DAG.getConstant(Offset, DL, PtrVT));
12289
12290 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12291 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12292 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12293 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12294
12295 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12296 MachinePointerInfo(SV, Offset),
12297 Align(PtrSize)));
12298 }
12299
12300 // void *__vr_top at offset 16 (8 on ILP32)
12301 Offset += PtrSize;
12302 int FPRSize = FuncInfo->getVarArgsFPRSize();
12303 if (FPRSize > 0) {
12304 SDValue VRTop, VRTopAddr;
12305 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12306 DAG.getConstant(Offset, DL, PtrVT));
12307
12308 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12309 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12310 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12311 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12312
12313 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12314 MachinePointerInfo(SV, Offset),
12315 Align(PtrSize)));
12316 }
12317
12318 // int __gr_offs at offset 24 (12 on ILP32)
12319 Offset += PtrSize;
12320 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12321 DAG.getConstant(Offset, DL, PtrVT));
12322 MemOps.push_back(
12323 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12324 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12325
12326 // int __vr_offs at offset 28 (16 on ILP32)
12327 Offset += 4;
12328 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12329 DAG.getConstant(Offset, DL, PtrVT));
12330 MemOps.push_back(
12331 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12332 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12333
12334 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12335}
12336
12337SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12338 SelectionDAG &DAG) const {
12339 MachineFunction &MF = DAG.getMachineFunction();
12340 Function &F = MF.getFunction();
12341
12342 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12343 return LowerWin64_VASTART(Op, DAG);
12344 else if (Subtarget->isTargetDarwin())
12345 return LowerDarwin_VASTART(Op, DAG);
12346 else
12347 return LowerAAPCS_VASTART(Op, DAG);
12348}
12349
12350SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12351 SelectionDAG &DAG) const {
12352 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
12353 // pointer.
12354 SDLoc DL(Op);
12355 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12356 unsigned VaListSize =
12357 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12358 ? PtrSize
12359 : Subtarget->isTargetILP32() ? 20 : 32;
12360 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12361 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12362
12363 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12364 DAG.getConstant(VaListSize, DL, MVT::i32),
12365 Align(PtrSize), false, false, /*CI=*/nullptr,
12366 std::nullopt, MachinePointerInfo(DestSV),
12367 MachinePointerInfo(SrcSV));
12368}
12369
12370SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12371 assert(Subtarget->isTargetDarwin() &&
12372 "automatic va_arg instruction only works on Darwin");
12373
12374 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12375 EVT VT = Op.getValueType();
12376 SDLoc DL(Op);
12377 SDValue Chain = Op.getOperand(0);
12378 SDValue Addr = Op.getOperand(1);
12379 MaybeAlign Align(Op.getConstantOperandVal(3));
12380 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12381 auto PtrVT = getPointerTy(DAG.getDataLayout());
12382 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12383 SDValue VAList =
12384 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12385 Chain = VAList.getValue(1);
12386 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12387
12388 if (VT.isScalableVector())
12389 report_fatal_error("Passing SVE types to variadic functions is "
12390 "currently not supported");
12391
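// For an over-aligned argument, round VAList up to the requested alignment
// first; e.g. for a 16-byte alignment this adds 15 and masks with -16, moving
// the pointer to the next 16-byte boundary.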
12392 if (Align && *Align > MinSlotSize) {
12393 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12394 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12395 VAList =
12396 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12397 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12398 }
12399
12400 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12401 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12402
12403 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12404 // up to 64 bits. At the very least, we have to increase the striding of the
12405 // vaargs list to match this, and for FP values we need to introduce
12406 // FP_ROUND nodes as well.
12407 if (VT.isInteger() && !VT.isVector())
12408 ArgSize = std::max(ArgSize, MinSlotSize);
12409 bool NeedFPTrunc = false;
12410 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12411 ArgSize = 8;
12412 NeedFPTrunc = true;
12413 }
12414
12415 // Increment the pointer, VAList, to the next vaarg
12416 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12417 DAG.getConstant(ArgSize, DL, PtrVT));
12418 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12419
12420 // Store the incremented VAList to the legalized pointer
12421 SDValue APStore =
12422 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12423
12424 // Load the actual argument out of the pointer VAList
12425 if (NeedFPTrunc) {
12426 // Load the value as an f64.
12427 SDValue WideFP =
12428 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12429 // Round the value down to an f32.
12430 SDValue NarrowFP =
12431 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12432 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12433 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12434 // Merge the rounded value with the chain output of the load.
12435 return DAG.getMergeValues(Ops, DL);
12436 }
12437
12438 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12439}
12440
12441SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12442 SelectionDAG &DAG) const {
12443 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12444 MFI.setFrameAddressIsTaken(true);
12445
12446 EVT VT = Op.getValueType();
12447 SDLoc DL(Op);
12448 unsigned Depth = Op.getConstantOperandVal(0);
12449 SDValue FrameAddr =
12450 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12451 while (Depth--)
12452 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12453 MachinePointerInfo());
12454
12455 if (Subtarget->isTargetILP32())
12456 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12457 DAG.getValueType(VT));
12458
12459 return FrameAddr;
12460}
12461
12462SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12463 SelectionDAG &DAG) const {
12464 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12465
12466 EVT VT = getPointerTy(DAG.getDataLayout());
12467 int FI = MFI.CreateFixedObject(4, 0, false);
12468 return DAG.getFrameIndex(FI, VT);
12469}
12470
12471#define GET_REGISTER_MATCHER
12472#include "AArch64GenAsmMatcher.inc"
12473
12474// FIXME? Maybe this could be a TableGen attribute on some registers and
12475// this table could be generated automatically from RegInfo.
12476Register AArch64TargetLowering::
12477getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12478 Register Reg = MatchRegisterName(RegName);
12479 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12480 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12481 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
12482 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12483 !MRI->isReservedReg(MF, Reg))
12484 Reg = Register();
12485 }
12486 return Reg;
12487}
12488
12489SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12490 SelectionDAG &DAG) const {
12492
12493 EVT VT = Op.getValueType();
12494 SDLoc DL(Op);
12495
12496 SDValue FrameAddr =
12497 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
12499
12500 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
12501}
12502
12503SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12504 SelectionDAG &DAG) const {
12505 MachineFunction &MF = DAG.getMachineFunction();
12506 MachineFrameInfo &MFI = MF.getFrameInfo();
12507 MFI.setReturnAddressIsTaken(true);
12508
12509 EVT VT = Op.getValueType();
12510 SDLoc DL(Op);
12511 unsigned Depth = Op.getConstantOperandVal(0);
12512 SDValue ReturnAddress;
12513 if (Depth) {
12514 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12516 ReturnAddress = DAG.getLoad(
12517 VT, DL, DAG.getEntryNode(),
12518 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
12519 } else {
12520 // Return LR, which contains the return address. Mark it an implicit
12521 // live-in.
12522 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
12523 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
12524 }
12525
12526 // The XPACLRI instruction assembles to a hint-space instruction before
12527 // Armv8.3-A, therefore it can be safely used on any pre-Armv8.3-A
12528 // architecture. On Armv8.3-A and onwards, XPACI is available, so use
12529 // that instead.
12530 SDNode *St;
12531 if (Subtarget->hasPAuth()) {
12532 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
12533 } else {
12534 // XPACLRI operates on LR therefore we must move the operand accordingly.
12535 SDValue Chain =
12536 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
12537 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
12538 }
12539 return SDValue(St, 0);
12540}
12541
12542 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
12543 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
12544SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
12545 SelectionDAG &DAG) const {
12546 SDValue Lo, Hi;
12547 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
12548 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
12549}
12550
12551 bool AArch64TargetLowering::isOffsetFoldingLegal(
12552 const GlobalAddressSDNode *GA) const {
12553 // Offsets are folded in the DAG combine rather than here so that we can
12554 // intelligently choose an offset based on the uses.
12555 return false;
12556}
12557
12558 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
12559 bool OptForSize) const {
12560 bool IsLegal = false;
12561 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
12562 // 16-bit case when target has full fp16 support.
12563 // We encode bf16 bit patterns as if they were fp16. This results in very
12564 // strange looking assembly but should populate the register with appropriate
12565 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12566 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12567 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
12568 // FIXME: We should be able to handle f128 as well with a clever lowering.
12569 const APInt ImmInt = Imm.bitcastToAPInt();
12570 if (VT == MVT::f64)
12571 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
12572 else if (VT == MVT::f32)
12573 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
12574 else if (VT == MVT::f16 || VT == MVT::bf16)
12575 IsLegal =
12576 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
12577 Imm.isPosZero();
12578
12579 // If we cannot materialize the value in the fmov immediate field, check if
12580 // it can be encoded as the immediate operand of a logical instruction.
12581 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12582 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12583 // generate that fmov.
12584 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12585 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12586 // however the mov+fmov sequence is always better because of the reduced
12587 // cache pressure. The timings are still the same if you consider
12588 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12589 // movw+movk is fused). So we limit to at most 2 instructions.
12590 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12591 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
12592 assert(Insn.size() <= 4 &&
12593 "Should be able to build any value with at most 4 moves");
12594 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12595 IsLegal = Insn.size() <= Limit;
12596 }
12597
12598 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12599 << " imm value: "; Imm.dump(););
12600 return IsLegal;
12601}
12602
12603//===----------------------------------------------------------------------===//
12604// AArch64 Optimization Hooks
12605//===----------------------------------------------------------------------===//
12606
12607static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12608 SDValue Operand, SelectionDAG &DAG,
12609 int &ExtraSteps) {
12610 EVT VT = Operand.getValueType();
12611 if ((ST->hasNEON() &&
12612 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12613 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12614 VT == MVT::v4f32)) ||
12615 (ST->hasSVE() &&
12616 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12617     if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
12618       // For the reciprocal estimates, convergence is quadratic, so the number
12619 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12620 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12621 // the result for float (23 mantissa bits) is 2 and for double (52
12622 // mantissa bits) is 3.
12623 constexpr unsigned AccurateBits = 8;
12624 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12625 ExtraSteps = DesiredBits <= AccurateBits
12626 ? 0
12627 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
12628 }
12629
12630 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12631 }
12632
12633 return SDValue();
12634}
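The step count in getEstimate follows directly from the quadratic convergence described in the comment: every iteration doubles the number of accurate bits, starting from 8. A plain constexpr restatement of that arithmetic; the helper name is illustrative and is not an LLVM utility.

namespace {
constexpr unsigned exampleCeilLog2(unsigned V) {
  unsigned Bits = 0;
  while ((1u << Bits) < V)
    ++Bits;
  return Bits;
}
// f32 carries 24 bits of precision and f64 carries 53; the initial estimate
// provides 8, matching the ExtraSteps values computed above.
static_assert(exampleCeilLog2(24) - exampleCeilLog2(8) == 2,
              "two refinement steps for f32");
static_assert(exampleCeilLog2(53) - exampleCeilLog2(8) == 3,
              "three refinement steps for f64");
} // namespace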
12635
12636SDValue
12637AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12638 const DenormalMode &Mode) const {
12639 SDLoc DL(Op);
12640 EVT VT = Op.getValueType();
12641 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12642 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12643 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12644}
12645
12646SDValue
12647AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12648 SelectionDAG &DAG) const {
12649 return Op;
12650}
12651
12652SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12653 SelectionDAG &DAG, int Enabled,
12654 int &ExtraSteps,
12655 bool &UseOneConst,
12656 bool Reciprocal) const {
12657   if (Enabled == ReciprocalEstimate::Enabled ||
12658       (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12659 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12660 DAG, ExtraSteps)) {
12661 SDLoc DL(Operand);
12662 EVT VT = Operand.getValueType();
12663
12664 // Ensure nodes can be recognized by isAssociativeAndCommutative.
12665 SDNodeFlags Flags =
12667
12668 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12669 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
12670 for (int i = ExtraSteps; i > 0; --i) {
12671 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12672 Flags);
12673 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12674 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12675 }
12676 if (!Reciprocal)
12677 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12678
12679 ExtraSteps = 0;
12680 return Estimate;
12681 }
12682
12683 return SDValue();
12684}
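For reference, a scalar restatement of the iteration getSqrtEstimate assembles out of FMUL and FRSQRTS nodes. The helper is an illustrative standalone sketch with an invented name; it is not called anywhere in this file.

// One Newton-Raphson step for 1/sqrt(X): E <- E * 0.5 * (3 - X * E * E).
[[maybe_unused]] static float exampleRSqrtStep(float X, float E) {
  float Square = E * E;                    // first FMUL above
  float Step = 0.5f * (3.0f - X * Square); // what FRSQRTS(X, Square) computes
  return E * Step;                         // second FMUL: refined estimate
}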
12685
12686SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12687 SelectionDAG &DAG, int Enabled,
12688 int &ExtraSteps) const {
12689   if (Enabled == ReciprocalEstimate::Enabled)
12690     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12691 DAG, ExtraSteps)) {
12692 SDLoc DL(Operand);
12693 EVT VT = Operand.getValueType();
12694
12696
12697 // Newton reciprocal iteration: E * (2 - X * E)
12698 // AArch64 reciprocal iteration instruction: (2 - M * N)
12699 for (int i = ExtraSteps; i > 0; --i) {
12700 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12701 Estimate, Flags);
12702 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12703 }
12704
12705 ExtraSteps = 0;
12706 return Estimate;
12707 }
12708
12709 return SDValue();
12710}
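Similarly, a scalar sketch of the reciprocal iteration built above from FRECPS and FMUL; the name is invented for illustration and the function is unused by the lowering.

// One Newton-Raphson step for 1/X: E <- E * (2 - X * E).
[[maybe_unused]] static float exampleRecipStep(float X, float E) {
  float Step = 2.0f - X * E; // what FRECPS(X, E) computes
  return E * Step;           // FMUL: refined estimate of 1/X
}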
12711
12712//===----------------------------------------------------------------------===//
12713// AArch64 Inline Assembly Support
12714//===----------------------------------------------------------------------===//
12715
12716// Table of Constraints
12717// TODO: This is the current set of constraints supported by ARM for the
12718 // compiler; not all of them may make sense.
12719//
12720// r - A general register
12721// w - An FP/SIMD register of some size in the range v0-v31
12722// x - An FP/SIMD register of some size in the range v0-v15
12723// I - Constant that can be used with an ADD instruction
12724// J - Constant that can be used with a SUB instruction
12725// K - Constant that can be used with a 32-bit logical instruction
12726// L - Constant that can be used with a 64-bit logical instruction
12727// M - Constant that can be used as a 32-bit MOV immediate
12728// N - Constant that can be used as a 64-bit MOV immediate
12729// Q - A memory reference with base register and no offset
12730// S - A symbolic address
12731// Y - Floating point constant zero
12732// Z - Integer constant zero
12733//
12734// Note that general register operands will be output using their 64-bit x
12735// register name, whatever the size of the variable, unless the asm operand
12736// is prefixed by the %w modifier. Floating-point and SIMD register operands
12737// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12738// %q modifier.
12739const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12740 // At this point, we have to lower this constraint to something else, so we
12741 // lower it to an "r" or "w". However, by doing this we will force the result
12742 // to be in register, while the X constraint is much more permissive.
12743 //
12744 // Although we are correct (we are free to emit anything, without
12745 // constraints), we might break use cases that would expect us to be more
12746 // efficient and emit something else.
12747 if (!Subtarget->hasFPARMv8())
12748 return "r";
12749
12750 if (ConstraintVT.isFloatingPoint())
12751 return "w";
12752
12753 if (ConstraintVT.isVector() &&
12754 (ConstraintVT.getSizeInBits() == 64 ||
12755 ConstraintVT.getSizeInBits() == 128))
12756 return "w";
12757
12758 return "r";
12759}
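The constraint letters in the table above reach the backend through GCC-style extended inline assembly. A hypothetical user-level example is kept as a comment because it is front-end syntax rather than code belonging to this file; the function and operand names are made up.

//   int add_imm(int X) {
//     int R;
//     __asm__("add %w0, %w1, %2" : "=r"(R) : "r"(X), "I"(42));
//     return R;
//   }
// "r" binds R and X to general registers (printed as w0/w1 because of the %w
// modifier) and "I" accepts an ADD immediate, i.e. a value in 0-4095 that may
// optionally be shifted left by 12.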
12760
12761 enum class PredicateConstraint { Uph, Upl, Upa };
12762
12763// Returns a {Reg, RegisterClass} tuple if the constraint is
12764// a specific predicate register.
12765//
12766// For some constraint like "{pn3}" the default path in
12767// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12768// suitable register class for this register is "PPRorPNR", after which it
12769// determines that nxv16i1 is an appropriate type for the constraint, which is
12770// not what we want. The code here pre-empts this by matching the register
12771// explicitly.
12772static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12773 parseSVERegAsConstraint(StringRef Constraint) {
12774   if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12775 (Constraint[1] != 'p' && Constraint[1] != 'z'))
12776 return std::nullopt;
12777
12778 bool IsPredicate = Constraint[1] == 'p';
12779 Constraint = Constraint.substr(2, Constraint.size() - 3);
12780 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
12781 if (IsPredicateAsCount)
12782 Constraint = Constraint.drop_front(1);
12783
12784 unsigned V;
12785 if (Constraint.getAsInteger(10, V) || V > 31)
12786 return std::nullopt;
12787
12788 if (IsPredicateAsCount)
12789 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12790 if (IsPredicate)
12791 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12792 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
12793}
12794
12795static std::optional<PredicateConstraint>
12796 parsePredicateConstraint(StringRef Constraint) {
12797   return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
12798       .Case("Uph", PredicateConstraint::Uph)
12799       .Case("Upl", PredicateConstraint::Upl)
12800       .Case("Upa", PredicateConstraint::Upa)
12801 .Default(std::nullopt);
12802}
12803
12804static const TargetRegisterClass *
12806 if (VT != MVT::aarch64svcount &&
12807 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12808 return nullptr;
12809
12810 switch (Constraint) {
12811   case PredicateConstraint::Uph:
12812     return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12813                                      : &AArch64::PPR_p8to15RegClass;
12814   case PredicateConstraint::Upl:
12815     return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12816                                      : &AArch64::PPR_3bRegClass;
12817   case PredicateConstraint::Upa:
12818     return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12819 : &AArch64::PPRRegClass;
12820 }
12821
12822 llvm_unreachable("Missing PredicateConstraint!");
12823}
12824
12825 enum class ReducedGprConstraint { Uci, Ucj };
12826
12827static std::optional<ReducedGprConstraint>
12828 parseReducedGprConstraint(StringRef Constraint) {
12829   return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
12830       .Case("Uci", ReducedGprConstraint::Uci)
12831       .Case("Ucj", ReducedGprConstraint::Ucj)
12832 .Default(std::nullopt);
12833}
12834
12835static const TargetRegisterClass *
12837 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12838 return nullptr;
12839
12840 switch (Constraint) {
12841   case ReducedGprConstraint::Uci:
12842     return &AArch64::MatrixIndexGPR32_8_11RegClass;
12843   case ReducedGprConstraint::Ucj:
12844     return &AArch64::MatrixIndexGPR32_12_15RegClass;
12845 }
12846
12847 llvm_unreachable("Missing ReducedGprConstraint!");
12848}
12849
12850// The set of cc code supported is from
12851// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
12852 static AArch64CC::CondCode parseConstraintCode(StringRef Constraint) {
12853   AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
12854                                  .Case("{@cchi}", AArch64CC::HI)
12855 .Case("{@cccs}", AArch64CC::HS)
12856 .Case("{@cclo}", AArch64CC::LO)
12857 .Case("{@ccls}", AArch64CC::LS)
12858 .Case("{@cccc}", AArch64CC::LO)
12859 .Case("{@cceq}", AArch64CC::EQ)
12860 .Case("{@ccgt}", AArch64CC::GT)
12861 .Case("{@ccge}", AArch64CC::GE)
12862 .Case("{@cclt}", AArch64CC::LT)
12863 .Case("{@ccle}", AArch64CC::LE)
12864 .Case("{@cchs}", AArch64CC::HS)
12865 .Case("{@ccne}", AArch64CC::NE)
12866 .Case("{@ccvc}", AArch64CC::VC)
12867 .Case("{@ccpl}", AArch64CC::PL)
12868 .Case("{@ccvs}", AArch64CC::VS)
12869 .Case("{@ccmi}", AArch64CC::MI)
12870                                  .Default(AArch64CC::Invalid);
12871   return Cond;
12872}
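The "{@cc...}" strings map GCC's flag-output constraints onto AArch64 condition codes. A hypothetical user-level example, again left as a comment since it is inline-asm source rather than backend code:

//   int is_equal(long A, long B) {
//     int EQ;
//     __asm__("cmp %1, %2" : "=@cceq"(EQ) : "r"(A), "r"(B) : "cc");
//     return EQ; // 1 when A == B, 0 otherwise
//   }
// The "=@cceq" output selects AArch64CC::EQ in the table above, and
// LowerAsmOutputForConstraint below materialises it with the CSET produced by
// getSETCC.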
12873
12874/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12875/// WZR, invert(<cond>)'.
12876 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
12877                         SelectionDAG &DAG) {
12878 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
12879 DAG.getConstant(0, DL, MVT::i32),
12880 DAG.getConstant(0, DL, MVT::i32),
12881 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
12882}
12883
12884// Lower @cc flag output via getSETCC.
12885SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12886 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12887 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12888 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12889 if (Cond == AArch64CC::Invalid)
12890 return SDValue();
12891 // The output variable should be a scalar integer.
12892 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12893 OpInfo.ConstraintVT.getSizeInBits() < 8)
12894 report_fatal_error("Flag output operand is of invalid type");
12895
12896 // Get NZCV register. Only update chain when copyfrom is glued.
12897 if (Glue.getNode()) {
12898 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
12899 Chain = Glue.getValue(1);
12900 } else
12901 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
12902 // Extract CC code.
12903 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12904
12906
12907 // Truncate or ZERO_EXTEND based on value types.
12908 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12909 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12910 else
12911 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
12912
12913 return Result;
12914}
12915
12916/// getConstraintType - Given a constraint letter, return the type of
12917/// constraint it is for this target.
12919AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12920 if (Constraint.size() == 1) {
12921 switch (Constraint[0]) {
12922 default:
12923 break;
12924 case 'x':
12925 case 'w':
12926 case 'y':
12927 return C_RegisterClass;
12928 // An address with a single base register. Due to the way we
12929 // currently handle addresses it is the same as 'r'.
12930 case 'Q':
12931 return C_Memory;
12932 case 'I':
12933 case 'J':
12934 case 'K':
12935 case 'L':
12936 case 'M':
12937 case 'N':
12938 case 'Y':
12939 case 'Z':
12940 return C_Immediate;
12941 case 'z':
12942 case 'S': // A symbol or label reference with a constant offset
12943 return C_Other;
12944 }
12945 } else if (parsePredicateConstraint(Constraint))
12946 return C_RegisterClass;
12947 else if (parseReducedGprConstraint(Constraint))
12948 return C_RegisterClass;
12949 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
12950 return C_Other;
12951 return TargetLowering::getConstraintType(Constraint);
12952}
12953
12954/// Examine constraint type and operand type and determine a weight value.
12955/// This object must already have been set up with the operand type
12956/// and the current alternative constraint selected.
12957 TargetLowering::ConstraintWeight
12958 AArch64TargetLowering::getSingleConstraintMatchWeight(
12959 AsmOperandInfo &info, const char *constraint) const {
12960   ConstraintWeight weight = CW_Invalid;
12961   Value *CallOperandVal = info.CallOperandVal;
12962 // If we don't have a value, we can't do a match,
12963 // but allow it at the lowest weight.
12964 if (!CallOperandVal)
12965 return CW_Default;
12966 Type *type = CallOperandVal->getType();
12967 // Look at the constraint type.
12968 switch (*constraint) {
12969 default:
12970     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
12971     break;
12972 case 'x':
12973 case 'w':
12974 case 'y':
12975 if (type->isFloatingPointTy() || type->isVectorTy())
12976 weight = CW_Register;
12977 break;
12978 case 'z':
12979 weight = CW_Constant;
12980 break;
12981 case 'U':
12982 if (parsePredicateConstraint(constraint) ||
12983 parseReducedGprConstraint(constraint))
12984 weight = CW_Register;
12985 break;
12986 }
12987 return weight;
12988}
12989
12990std::pair<unsigned, const TargetRegisterClass *>
12991AArch64TargetLowering::getRegForInlineAsmConstraint(
12992 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12993 if (Constraint.size() == 1) {
12994 switch (Constraint[0]) {
12995 case 'r':
12996 if (VT.isScalableVector())
12997 return std::make_pair(0U, nullptr);
12998 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
12999 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
13000 if (VT.getFixedSizeInBits() == 64)
13001 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
13002 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
13003 case 'w': {
13004 if (!Subtarget->hasFPARMv8())
13005 break;
13006 if (VT.isScalableVector()) {
13007 if (VT.getVectorElementType() != MVT::i1)
13008 return std::make_pair(0U, &AArch64::ZPRRegClass);
13009 return std::make_pair(0U, nullptr);
13010 }
13011 if (VT == MVT::Other)
13012 break;
13013 uint64_t VTSize = VT.getFixedSizeInBits();
13014 if (VTSize == 16)
13015 return std::make_pair(0U, &AArch64::FPR16RegClass);
13016 if (VTSize == 32)
13017 return std::make_pair(0U, &AArch64::FPR32RegClass);
13018 if (VTSize == 64)
13019 return std::make_pair(0U, &AArch64::FPR64RegClass);
13020 if (VTSize == 128)
13021 return std::make_pair(0U, &AArch64::FPR128RegClass);
13022 break;
13023 }
13024 // The instructions that this constraint is designed for can
13025 // only take 128-bit registers so just use that regclass.
13026 case 'x':
13027 if (!Subtarget->hasFPARMv8())
13028 break;
13029 if (VT.isScalableVector())
13030 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
13031 if (VT.getSizeInBits() == 128)
13032 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
13033 break;
13034 case 'y':
13035 if (!Subtarget->hasFPARMv8())
13036 break;
13037 if (VT.isScalableVector())
13038 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
13039 break;
13040 }
13041 } else {
13042 if (const auto P = parseSVERegAsConstraint(Constraint)) {
13043       // SME functions that are not in streaming mode should
13044       // still observe clobbers of Z-registers by clobbering
13045       // the lower 128 bits of those registers.
13046 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
13047 !Subtarget->isSVEorStreamingSVEAvailable())
13048 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
13049 &AArch64::FPR128RegClass);
13050 return *P;
13051 }
13052 if (const auto PC = parsePredicateConstraint(Constraint))
13053 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
13054 return std::make_pair(0U, RegClass);
13055
13056 if (const auto RGC = parseReducedGprConstraint(Constraint))
13057 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
13058 return std::make_pair(0U, RegClass);
13059 }
13060 if (StringRef("{cc}").equals_insensitive(Constraint) ||
13061       parseConstraintCode(Constraint) != AArch64CC::Invalid)
13062     return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
13063
13064 if (Constraint == "{za}") {
13065 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
13066 }
13067
13068 if (Constraint == "{zt0}") {
13069 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
13070 }
13071
13072 // Use the default implementation in TargetLowering to convert the register
13073 // constraint into a member of a register class.
13074 std::pair<unsigned, const TargetRegisterClass *> Res;
13075   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
13076
13077 // Not found as a standard register?
13078 if (!Res.second) {
13079 unsigned Size = Constraint.size();
13080 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
13081 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
13082 int RegNo;
13083 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
13084 if (!Failed && RegNo >= 0 && RegNo <= 31) {
13085 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
13086 // By default we'll emit v0-v31 for this unless there's a modifier where
13087 // we'll emit the correct register as well.
13088 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
13089 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
13090 Res.second = &AArch64::FPR64RegClass;
13091 } else {
13092 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13093 Res.second = &AArch64::FPR128RegClass;
13094 }
13095 }
13096 }
13097 }
13098
13099 if (Res.second && !Subtarget->hasFPARMv8() &&
13100 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
13101 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
13102 return std::make_pair(0U, nullptr);
13103
13104 return Res;
13105}
13106
13107 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
13108                                                   llvm::Type *Ty,
13109 bool AllowUnknown) const {
13110 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
13111 return EVT(MVT::i64x8);
13112
13113 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
13114}
13115
13116/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13117/// vector. If it is invalid, don't add anything to Ops.
13118void AArch64TargetLowering::LowerAsmOperandForConstraint(
13119 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
13120 SelectionDAG &DAG) const {
13121 SDValue Result;
13122
13123 // Currently only support length 1 constraints.
13124 if (Constraint.size() != 1)
13125 return;
13126
13127 char ConstraintLetter = Constraint[0];
13128 switch (ConstraintLetter) {
13129 default:
13130 break;
13131
13132 // This set of constraints deal with valid constants for various instructions.
13133 // Validate and return a target constant for them if we can.
13134 case 'z': {
13135 // 'z' maps to xzr or wzr so it needs an input of 0.
13136 if (!isNullConstant(Op))
13137 return;
13138
13139 if (Op.getValueType() == MVT::i64)
13140 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
13141 else
13142 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
13143 break;
13144 }
13145 case 'S':
13146 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
13147 // supported for PIC while "s" isn't, making "s" less useful. We implement
13148 // "S" but not "s".
13150 break;
13151
13152 case 'I':
13153 case 'J':
13154 case 'K':
13155 case 'L':
13156 case 'M':
13157   case 'N': {
13158     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
13159     if (!C)
13160 return;
13161
13162 // Grab the value and do some validation.
13163 uint64_t CVal = C->getZExtValue();
13164 switch (ConstraintLetter) {
13165 // The I constraint applies only to simple ADD or SUB immediate operands:
13166 // i.e. 0 to 4095 with optional shift by 12
13167 // The J constraint applies only to ADD or SUB immediates that would be
13168 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
13169 // instruction [or vice versa], in other words -1 to -4095 with optional
13170 // left shift by 12.
13171 case 'I':
13172 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
13173 break;
13174 return;
13175 case 'J': {
13176 uint64_t NVal = -C->getSExtValue();
13177 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
13178 CVal = C->getSExtValue();
13179 break;
13180 }
13181 return;
13182 }
13183 // The K and L constraints apply *only* to logical immediates, including
13184 // what used to be the MOVI alias for ORR (though the MOVI alias has now
13185 // been removed and MOV should be used). So these constraints have to
13186 // distinguish between bit patterns that are valid 32-bit or 64-bit
13187 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
13188 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
13189 // versa.
13190 case 'K':
13191 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13192 break;
13193 return;
13194 case 'L':
13195 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13196 break;
13197 return;
13198 // The M and N constraints are a superset of K and L respectively, for use
13199 // with the MOV (immediate) alias. As well as the logical immediates they
13200 // also match 32 or 64-bit immediates that can be loaded either using a
13201   // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
13202 // (M) or 64-bit 0x1234000000000000 (N) etc.
13203 // As a note some of this code is liberally stolen from the asm parser.
13204 case 'M': {
13205 if (!isUInt<32>(CVal))
13206 return;
13207 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13208 break;
13209 if ((CVal & 0xFFFF) == CVal)
13210 break;
13211 if ((CVal & 0xFFFF0000ULL) == CVal)
13212 break;
13213 uint64_t NCVal = ~(uint32_t)CVal;
13214 if ((NCVal & 0xFFFFULL) == NCVal)
13215 break;
13216 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13217 break;
13218 return;
13219 }
13220 case 'N': {
13221 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13222 break;
13223 if ((CVal & 0xFFFFULL) == CVal)
13224 break;
13225 if ((CVal & 0xFFFF0000ULL) == CVal)
13226 break;
13227 if ((CVal & 0xFFFF00000000ULL) == CVal)
13228 break;
13229 if ((CVal & 0xFFFF000000000000ULL) == CVal)
13230 break;
13231 uint64_t NCVal = ~CVal;
13232 if ((NCVal & 0xFFFFULL) == NCVal)
13233 break;
13234 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13235 break;
13236 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
13237 break;
13238 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
13239 break;
13240 return;
13241 }
13242 default:
13243 return;
13244 }
13245
13246 // All assembler immediates are 64-bit integers.
13247 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
13248 break;
13249 }
13250
13251 if (Result.getNode()) {
13252 Ops.push_back(Result);
13253 return;
13254 }
13255
13256 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13257}
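The 'I' rule spelled out in the comments above reduces to a simple bit test, and 'J' is its negated counterpart. A standalone constexpr restatement with an invented name, not the predicate used by the switch:

namespace {
// 'I': a 12-bit unsigned immediate, optionally shifted left by 12.
// 'J' accepts values whose negation passes the same test, i.e. -1 to -4095
// with the optional shift.
constexpr bool exampleIsAddImm(uint64_t V) {
  return V <= 0xFFFu || ((V & 0xFFFu) == 0 && (V >> 12) <= 0xFFFu);
}
static_assert(exampleIsAddImm(4095) && exampleIsAddImm(0x123000),
              "plain and shifted 12-bit immediates are accepted");
static_assert(!exampleIsAddImm(0x1001000),
              "a shifted value must still fit in 12 bits");
} // namespace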
13258
13259//===----------------------------------------------------------------------===//
13260// AArch64 Advanced SIMD Support
13261//===----------------------------------------------------------------------===//
13262
13263/// WidenVector - Given a value in the V64 register class, produce the
13264/// equivalent value in the V128 register class.
13265 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
13266   EVT VT = V64Reg.getValueType();
13267 unsigned NarrowSize = VT.getVectorNumElements();
13268 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13269 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13270 SDLoc DL(V64Reg);
13271
13272 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
13273 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13274}
13275
13276/// getExtFactor - Determine the adjustment factor for the position when
13277/// generating an "extract from vector registers" instruction.
13278static unsigned getExtFactor(SDValue &V) {
13279 EVT EltType = V.getValueType().getVectorElementType();
13280 return EltType.getSizeInBits() / 8;
13281}
13282
13283// Check if a vector is built from one vector via extracted elements of
13284// another together with an AND mask, ensuring that all elements fit
13285// within range. This can be reconstructed using AND and NEON's TBL1.
13287 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13288 SDLoc DL(Op);
13289 EVT VT = Op.getValueType();
13290 assert(!VT.isScalableVector() &&
13291 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13292
13293 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13294 // directly to TBL1.
13295 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13296 return SDValue();
13297
13298 unsigned NumElts = VT.getVectorNumElements();
13299 assert((NumElts == 8 || NumElts == 16) &&
13300 "Need to have exactly 8 or 16 elements in vector.");
13301
13302 SDValue SourceVec;
13303 SDValue MaskSourceVec;
13304 SmallVector<SDValue, 16> AndMaskConstants;
13305
13306 for (unsigned i = 0; i < NumElts; ++i) {
13307 SDValue V = Op.getOperand(i);
13308 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13309 return SDValue();
13310
13311 SDValue OperandSourceVec = V.getOperand(0);
13312 if (!SourceVec)
13313 SourceVec = OperandSourceVec;
13314 else if (SourceVec != OperandSourceVec)
13315 return SDValue();
13316
13317 // This only looks at shuffles with elements that are
13318 // a) truncated by a constant AND mask extracted from a mask vector, or
13319 // b) extracted directly from a mask vector.
13320 SDValue MaskSource = V.getOperand(1);
13321 if (MaskSource.getOpcode() == ISD::AND) {
13322 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13323 return SDValue();
13324
13325 AndMaskConstants.push_back(MaskSource.getOperand(1));
13326 MaskSource = MaskSource->getOperand(0);
13327 } else if (!AndMaskConstants.empty()) {
13328 // Either all or no operands should have an AND mask.
13329 return SDValue();
13330 }
13331
13332 // An ANY_EXTEND may be inserted between the AND and the source vector
13333 // extraction. We don't care about that, so we can just skip it.
13334 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13335 MaskSource = MaskSource.getOperand(0);
13336
13337 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13338 return SDValue();
13339
13340 SDValue MaskIdx = MaskSource.getOperand(1);
13341 if (!isa<ConstantSDNode>(MaskIdx) ||
13342 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13343 return SDValue();
13344
13345 // We only apply this if all elements come from the same vector with the
13346 // same vector type.
13347 if (!MaskSourceVec) {
13348 MaskSourceVec = MaskSource->getOperand(0);
13349 if (MaskSourceVec.getValueType() != VT)
13350 return SDValue();
13351 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13352 return SDValue();
13353 }
13354 }
13355
13356 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13357 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13358 // insert, we know that the index in the mask must be smaller than the number
13359 // of elements in the source, or we would have an out-of-bounds access.
13360 if (NumElts == 8)
13361 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13362 DAG.getUNDEF(VT));
13363
13364 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13365 if (!AndMaskConstants.empty())
13366 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13367 DAG.getBuildVector(VT, DL, AndMaskConstants));
13368
13369 return DAG.getNode(
13370       ISD::INTRINSIC_WO_CHAIN, DL, VT,
13371       DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), SourceVec,
13372 MaskSourceVec);
13373}
13374
13375// Gather data to see if the operation can be modelled as a
13376// shuffle in combination with VEXTs.
13377 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
13378                                                   SelectionDAG &DAG) const {
13379 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13380 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13381 SDLoc DL(Op);
13382 EVT VT = Op.getValueType();
13383 assert(!VT.isScalableVector() &&
13384 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13385 unsigned NumElts = VT.getVectorNumElements();
13386
13387 struct ShuffleSourceInfo {
13388 SDValue Vec;
13389 unsigned MinElt;
13390 unsigned MaxElt;
13391
13392 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13393 // be compatible with the shuffle we intend to construct. As a result
13394 // ShuffleVec will be some sliding window into the original Vec.
13395 SDValue ShuffleVec;
13396
13397 // Code should guarantee that element i in Vec starts at element "WindowBase
13398 // + i * WindowScale in ShuffleVec".
13399 int WindowBase;
13400 int WindowScale;
13401
13402 ShuffleSourceInfo(SDValue Vec)
13403 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13404 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13405
13406 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13407 };
13408
13409 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13410 // node.
13411   SmallVector<ShuffleSourceInfo, 2> Sources;
13412   for (unsigned i = 0; i < NumElts; ++i) {
13413 SDValue V = Op.getOperand(i);
13414 if (V.isUndef())
13415 continue;
13416 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13417 !isa<ConstantSDNode>(V.getOperand(1)) ||
13418 V.getOperand(0).getValueType().isScalableVector()) {
13419 LLVM_DEBUG(
13420 dbgs() << "Reshuffle failed: "
13421 "a shuffle can only come from building a vector from "
13422 "various elements of other fixed-width vectors, provided "
13423 "their indices are constant\n");
13424 return SDValue();
13425 }
13426
13427 // Add this element source to the list if it's not already there.
13428 SDValue SourceVec = V.getOperand(0);
13429 auto Source = find(Sources, SourceVec);
13430 if (Source == Sources.end())
13431 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13432
13433 // Update the minimum and maximum lane number seen.
13434 unsigned EltNo = V.getConstantOperandVal(1);
13435 Source->MinElt = std::min(Source->MinElt, EltNo);
13436 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13437 }
13438
13439 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13440 // better than moving to/from gpr registers for larger vectors.
13441 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13442 // Construct a mask for the tbl. We may need to adjust the index for types
13443 // larger than i8.
13444     SmallVector<int, 16> Mask;
13445     unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13446 for (unsigned I = 0; I < NumElts; ++I) {
13447 SDValue V = Op.getOperand(I);
13448 if (V.isUndef()) {
13449 for (unsigned OF = 0; OF < OutputFactor; OF++)
13450 Mask.push_back(-1);
13451 continue;
13452 }
13453 // Set the Mask lanes adjusted for the size of the input and output
13454 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13455 // output element, adjusted in their positions per input and output types.
13456 unsigned Lane = V.getConstantOperandVal(1);
13457 for (unsigned S = 0; S < Sources.size(); S++) {
13458 if (V.getOperand(0) == Sources[S].Vec) {
13459 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13460 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13461 for (unsigned OF = 0; OF < OutputFactor; OF++)
13462 Mask.push_back(InputBase + OF);
13463 break;
13464 }
13465 }
13466 }
13467
13468 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13469 // v16i8, and the TBLMask
13470 SmallVector<SDValue, 16> TBLOperands;
13471 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13472 ? Intrinsic::aarch64_neon_tbl3
13473 : Intrinsic::aarch64_neon_tbl4,
13474 DL, MVT::i32));
13475 for (unsigned i = 0; i < Sources.size(); i++) {
13476 SDValue Src = Sources[i].Vec;
13477 EVT SrcVT = Src.getValueType();
13478 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13479 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13480 "Expected a legally typed vector");
13481 if (SrcVT.is64BitVector())
13482 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13483 DAG.getUNDEF(MVT::v8i8));
13484 TBLOperands.push_back(Src);
13485 }
13486
13487     SmallVector<SDValue, 16> TBLMask;
13488     for (unsigned i = 0; i < Mask.size(); i++)
13489 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13490 assert((Mask.size() == 8 || Mask.size() == 16) &&
13491 "Expected a v8i8 or v16i8 Mask");
13492 TBLOperands.push_back(DAG.getBuildVector(
13493 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13494
13495 SDValue Shuffle =
13496         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
13497                     Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
13498 return DAG.getBitcast(VT, Shuffle);
13499 }
13500
13501 if (Sources.size() > 2) {
13502 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13503 << "sensible when at most two source vectors are "
13504 << "involved\n");
13505 return SDValue();
13506 }
13507
13508 // Find out the smallest element size among result and two sources, and use
13509 // it as element size to build the shuffle_vector.
13510 EVT SmallestEltTy = VT.getVectorElementType();
13511 for (auto &Source : Sources) {
13512 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13513 if (SrcEltTy.bitsLT(SmallestEltTy)) {
13514 SmallestEltTy = SrcEltTy;
13515 }
13516 }
13517 unsigned ResMultiplier =
13518 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13519 uint64_t VTSize = VT.getFixedSizeInBits();
13520 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13521 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
13522
13523 // If the source vector is too wide or too narrow, we may nevertheless be able
13524 // to construct a compatible shuffle either by concatenating it with UNDEF or
13525 // extracting a suitable range of elements.
13526 for (auto &Src : Sources) {
13527 EVT SrcVT = Src.ShuffleVec.getValueType();
13528
13529 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13530 if (SrcVTSize == TypeSize::getFixed(VTSize))
13531 continue;
13532
13533 // This stage of the search produces a source with the same element type as
13534 // the original, but with a total width matching the BUILD_VECTOR output.
13535 EVT EltVT = SrcVT.getVectorElementType();
13536 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13537 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
13538
13539 if (SrcVTSize.getFixedValue() < VTSize) {
13540 assert(2 * SrcVTSize == VTSize);
13541 // We can pad out the smaller vector for free, so if it's part of a
13542 // shuffle...
13543 Src.ShuffleVec =
13544 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
13545 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
13546 continue;
13547 }
13548
13549 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
13550 LLVM_DEBUG(
13551 dbgs() << "Reshuffle failed: result vector too small to extract\n");
13552 return SDValue();
13553 }
13554
13555 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
13556 LLVM_DEBUG(
13557 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13558 return SDValue();
13559 }
13560
13561 if (Src.MinElt >= NumSrcElts) {
13562 // The extraction can just take the second half
13563 Src.ShuffleVec =
13564 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13565 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13566 Src.WindowBase = -NumSrcElts;
13567 } else if (Src.MaxElt < NumSrcElts) {
13568 // The extraction can just take the first half
13569 Src.ShuffleVec =
13570 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13571 DAG.getConstant(0, DL, MVT::i64));
13572 } else {
13573 // An actual VEXT is needed
13574 SDValue VEXTSrc1 =
13575 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13576 DAG.getConstant(0, DL, MVT::i64));
13577 SDValue VEXTSrc2 =
13578 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13579 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13580 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
13581
13582 if (!SrcVT.is64BitVector()) {
13583 LLVM_DEBUG(
13584 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13585 "for SVE vectors.");
13586 return SDValue();
13587 }
13588
13589 Src.ShuffleVec =
13590 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
13591 DAG.getConstant(Imm, DL, MVT::i32));
13592 Src.WindowBase = -Src.MinElt;
13593 }
13594 }
13595
13596 // Another possible incompatibility occurs from the vector element types. We
13597 // can fix this by bitcasting the source vectors to the same type we intend
13598 // for the shuffle.
13599 for (auto &Src : Sources) {
13600 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13601 if (SrcEltTy == SmallestEltTy)
13602 continue;
13603 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13604 if (DAG.getDataLayout().isBigEndian()) {
13605 Src.ShuffleVec =
13606 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
13607 } else {
13608 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
13609 }
13610 Src.WindowScale =
13611 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13612 Src.WindowBase *= Src.WindowScale;
13613 }
13614
13615 // Final check before we try to actually produce a shuffle.
13616 LLVM_DEBUG({
13617 for (auto Src : Sources)
13618 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13619 });
13620
13621 // The stars all align, our next step is to produce the mask for the shuffle.
13622 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13623 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13624 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13625 SDValue Entry = Op.getOperand(i);
13626 if (Entry.isUndef())
13627 continue;
13628
13629 auto Src = find(Sources, Entry.getOperand(0));
13630 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13631
13632 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13633 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13634 // segment.
13635 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13636 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13637 VT.getScalarSizeInBits());
13638 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13639
13640 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13641 // starting at the appropriate offset.
13642 int *LaneMask = &Mask[i * ResMultiplier];
13643
13644 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13645 ExtractBase += NumElts * (Src - Sources.begin());
13646 for (int j = 0; j < LanesDefined; ++j)
13647 LaneMask[j] = ExtractBase + j;
13648 }
13649
13650 // Final check before we try to produce nonsense...
13651 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13652 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13653 return SDValue();
13654 }
13655
13656 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13657 for (unsigned i = 0; i < Sources.size(); ++i)
13658 ShuffleOps[i] = Sources[i].ShuffleVec;
13659
13660 SDValue Shuffle =
13661 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
13662 SDValue V;
13663 if (DAG.getDataLayout().isBigEndian()) {
13664 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
13665 } else {
13666 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
13667 }
13668
13669 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13670 dbgs() << "Reshuffle, creating node: "; V.dump(););
13671
13672 return V;
13673}
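The tbl3/tbl4 path above derives each mask byte as 16 * S + Lane * InputSize / 8 plus the byte offset within the output element. A constexpr restatement with a worked value; the helper name is illustrative only.

namespace {
constexpr unsigned exampleTblByte(unsigned Source, unsigned Lane,
                                  unsigned InputBits, unsigned ByteInLane) {
  return 16 * Source + Lane * (InputBits / 8) + ByteInLane;
}
// Lane 3 of the second v8i16 source contributes mask bytes 22 and 23.
static_assert(exampleTblByte(1, 3, 16, 0) == 22 &&
                  exampleTblByte(1, 3, 16, 1) == 23,
              "byte indices for source 1, lane 3 of a 16-bit element input");
} // namespace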
13674
13675// check if an EXT instruction can handle the shuffle mask when the
13676// vector sources of the shuffle are the same.
13677static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13678 unsigned NumElts = VT.getVectorNumElements();
13679
13680 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13681 if (M[0] < 0)
13682 return false;
13683
13684 Imm = M[0];
13685
13686 // If this is a VEXT shuffle, the immediate value is the index of the first
13687 // element. The other shuffle indices must be the successive elements after
13688 // the first one.
13689 unsigned ExpectedElt = Imm;
13690 for (unsigned i = 1; i < NumElts; ++i) {
13691 // Increment the expected index. If it wraps around, just follow it
13692 // back to index zero and keep going.
13693 ++ExpectedElt;
13694 if (ExpectedElt == NumElts)
13695 ExpectedElt = 0;
13696
13697 if (M[i] < 0)
13698 continue; // ignore UNDEF indices
13699 if (ExpectedElt != static_cast<unsigned>(M[i]))
13700 return false;
13701 }
13702
13703 return true;
13704}
13705
13706// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13707// v4i32s. This is really a truncate, which we can construct out of (legal)
13708// concats and truncate nodes.
13710 if (V.getValueType() != MVT::v16i8)
13711 return SDValue();
13712 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13713
13714 for (unsigned X = 0; X < 4; X++) {
13715 // Check the first item in each group is an extract from lane 0 of a v4i32
13716 // or v4i16.
13717 SDValue BaseExt = V.getOperand(X * 4);
13718 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13719 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13720 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13721 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13722 BaseExt.getConstantOperandVal(1) != 0)
13723 return SDValue();
13724 SDValue Base = BaseExt.getOperand(0);
13725 // And check the other items are extracts from the same vector.
13726 for (unsigned Y = 1; Y < 4; Y++) {
13727 SDValue Ext = V.getOperand(X * 4 + Y);
13728 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13729 Ext.getOperand(0) != Base ||
13730 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13731 Ext.getConstantOperandVal(1) != Y)
13732 return SDValue();
13733 }
13734 }
13735
13736   // Turn the buildvector into a series of truncates and concats, which will
13737   // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
13738   // concatenated together to produce 2 v8i16s. These are both truncated and
13739   // concatenated together.
13740 SDLoc DL(V);
13741 SDValue Trunc[4] = {
13742 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13743 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13744 for (SDValue &V : Trunc)
13745 if (V.getValueType() == MVT::v4i32)
13746 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13747 SDValue Concat0 =
13748 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13749 SDValue Concat1 =
13750 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13751 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13752 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13753 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13754}
13755
13756/// Check if a vector shuffle corresponds to a DUP instructions with a larger
13757/// element width than the vector lane type. If that is the case the function
13758/// returns true and writes the value of the DUP instruction lane operand into
13759/// DupLaneOp
13760static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13761 unsigned &DupLaneOp) {
13762 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13763 "Only possible block sizes for wide DUP are: 16, 32, 64");
13764
13765 if (BlockSize <= VT.getScalarSizeInBits())
13766 return false;
13767 if (BlockSize % VT.getScalarSizeInBits() != 0)
13768 return false;
13769 if (VT.getSizeInBits() % BlockSize != 0)
13770 return false;
13771
13772 size_t SingleVecNumElements = VT.getVectorNumElements();
13773 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13774 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13775
13776 // We are looking for masks like
13777 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13778 // might be replaced by 'undefined'. BlockIndices will eventually contain
13779 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13780 // for the above examples)
13781 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13782 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13783 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13784 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13785 if (Elt < 0)
13786 continue;
13787 // For now we don't support shuffles that use the second operand
13788 if ((unsigned)Elt >= SingleVecNumElements)
13789 return false;
13790 if (BlockElts[I] < 0)
13791 BlockElts[I] = Elt;
13792 else if (BlockElts[I] != Elt)
13793 return false;
13794 }
13795
13796 // We found a candidate block (possibly with some undefs). It must be a
13797 // sequence of consecutive integers starting with a value divisible by
13798 // NumEltsPerBlock with some values possibly replaced by undef-s.
13799
13800 // Find first non-undef element
13801 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13802 assert(FirstRealEltIter != BlockElts.end() &&
13803 "Shuffle with all-undefs must have been caught by previous cases, "
13804 "e.g. isSplat()");
13805 if (FirstRealEltIter == BlockElts.end()) {
13806 DupLaneOp = 0;
13807 return true;
13808 }
13809
13810 // Index of FirstRealElt in BlockElts
13811 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13812
13813 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13814 return false;
13815 // BlockElts[0] must have the following value if it isn't undef:
13816 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13817
13818 // Check the first element
13819 if (Elt0 % NumEltsPerBlock != 0)
13820 return false;
13821 // Check that the sequence indeed consists of consecutive integers (modulo
13822 // undefs)
13823 for (size_t I = 0; I < NumEltsPerBlock; I++)
13824 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13825 return false;
13826
13827 DupLaneOp = Elt0 / NumEltsPerBlock;
13828 return true;
13829}
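As a worked illustration of isWideDUPMask: for a v8i16 shuffle mask <2, 3, 2, 3, 2, 3, 2, 3> with BlockSize 32, NumEltsPerBlock is 32 / 16 = 2, every block collapses to <2, 3>, Elt0 = 2 is divisible by NumEltsPerBlock, and the function reports a 32-bit wide DUP with DupLaneOp = 2 / 2 = 1, i.e. duplicate the second 32-bit lane of the source.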
13830
13831// check if an EXT instruction can handle the shuffle mask when the
13832// vector sources of the shuffle are different.
13833static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13834 unsigned &Imm) {
13835 // Look for the first non-undef element.
13836 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13837
13838 // Benefit from APInt to handle overflow when calculating expected element.
13839 unsigned NumElts = VT.getVectorNumElements();
13840 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13841 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13842 /*implicitTrunc=*/true);
13843 // The following shuffle indices must be the successive elements after the
13844 // first real element.
13845 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13846 return Elt != ExpectedElt++ && Elt >= 0;
13847 });
13848 if (FoundWrongElt)
13849 return false;
13850
13851 // The index of an EXT is the first element if it is not UNDEF.
13852 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13853 // value of the first element. E.g.
13854 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13855 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13856 // ExpectedElt is the last mask index plus 1.
13857 Imm = ExpectedElt.getZExtValue();
13858
13859   // There are two different cases that require reversing the input vectors.
13860 // For example, for vector <4 x i32> we have the following cases,
13861 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13862 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13863 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13864 // to reverse two input vectors.
13865 if (Imm < NumElts)
13866 ReverseEXT = true;
13867 else
13868 Imm -= NumElts;
13869
13870 return true;
13871}
13872
13873/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13874/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13875/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13876static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13877 unsigned NumElts = VT.getVectorNumElements();
13878 if (NumElts % 2 != 0)
13879 return false;
13880 WhichResult = (M[0] == 0 ? 0 : 1);
13881 unsigned Idx = WhichResult * NumElts / 2;
13882 for (unsigned i = 0; i != NumElts; i += 2) {
13883 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13884 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13885 return false;
13886 Idx += 1;
13887 }
13888
13889 return true;
13890}
13891
13892/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13893/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13894/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
13895static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13896 unsigned Half = VT.getVectorNumElements() / 2;
13897 WhichResult = (M[0] == 0 ? 0 : 1);
13898 for (unsigned j = 0; j != 2; ++j) {
13899 unsigned Idx = WhichResult;
13900 for (unsigned i = 0; i != Half; ++i) {
13901 int MIdx = M[i + j * Half];
13902 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13903 return false;
13904 Idx += 2;
13905 }
13906 }
13907
13908 return true;
13909}
13910
13911/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13912/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13913/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13914static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13915 unsigned NumElts = VT.getVectorNumElements();
13916 if (NumElts % 2 != 0)
13917 return false;
13918 WhichResult = (M[0] == 0 ? 0 : 1);
13919 for (unsigned i = 0; i < NumElts; i += 2) {
13920 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13921 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13922 return false;
13923 }
13924 return true;
13925}
13926
13927static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13928 bool &DstIsLeft, int &Anomaly) {
13929 if (M.size() != static_cast<size_t>(NumInputElements))
13930 return false;
13931
13932 int NumLHSMatch = 0, NumRHSMatch = 0;
13933 int LastLHSMismatch = -1, LastRHSMismatch = -1;
13934
13935 for (int i = 0; i < NumInputElements; ++i) {
13936 if (M[i] == -1) {
13937 ++NumLHSMatch;
13938 ++NumRHSMatch;
13939 continue;
13940 }
13941
13942 if (M[i] == i)
13943 ++NumLHSMatch;
13944 else
13945 LastLHSMismatch = i;
13946
13947 if (M[i] == i + NumInputElements)
13948 ++NumRHSMatch;
13949 else
13950 LastRHSMismatch = i;
13951 }
13952
13953 if (NumLHSMatch == NumInputElements - 1) {
13954 DstIsLeft = true;
13955 Anomaly = LastLHSMismatch;
13956 return true;
13957 } else if (NumRHSMatch == NumInputElements - 1) {
13958 DstIsLeft = false;
13959 Anomaly = LastRHSMismatch;
13960 return true;
13961 }
13962
13963 return false;
13964}
13965
13966static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
13967 if (VT.getSizeInBits() != 128)
13968 return false;
13969
13970 unsigned NumElts = VT.getVectorNumElements();
13971
13972 for (int I = 0, E = NumElts / 2; I != E; I++) {
13973 if (Mask[I] != I)
13974 return false;
13975 }
13976
13977 int Offset = NumElts / 2;
13978 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
13979 if (Mask[I] != I + SplitLHS * Offset)
13980 return false;
13981 }
13982
13983 return true;
13984}
13985
13987 SDLoc DL(Op);
13988 EVT VT = Op.getValueType();
13989 SDValue V0 = Op.getOperand(0);
13990 SDValue V1 = Op.getOperand(1);
13991 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13992
13995 return SDValue();
13996
13997 bool SplitV0 = V0.getValueSizeInBits() == 128;
13998
13999 if (!isConcatMask(Mask, VT, SplitV0))
14000 return SDValue();
14001
14002 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14003 if (SplitV0) {
14004 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14005 DAG.getConstant(0, DL, MVT::i64));
14006 }
14007 if (V1.getValueSizeInBits() == 128) {
14008 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14009 DAG.getConstant(0, DL, MVT::i64));
14010 }
14011 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14012}
14013
14014/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14015/// the specified operations to build the shuffle. ID is the perfect-shuffle
14016 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
14017 /// table entry and LHS/RHS are the immediate inputs for this stage of the
14018 /// shuffle.
14019 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
14020                                       unsigned PFEntry, SDValue LHS,
14021 SDValue RHS, SelectionDAG &DAG,
14022 const SDLoc &DL) {
14023 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14024 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14025 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
14026
14027 enum {
14028 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14029 OP_VREV,
14030 OP_VDUP0,
14031 OP_VDUP1,
14032 OP_VDUP2,
14033 OP_VDUP3,
14034 OP_VEXT1,
14035 OP_VEXT2,
14036 OP_VEXT3,
14037 OP_VUZPL, // VUZP, left result
14038 OP_VUZPR, // VUZP, right result
14039 OP_VZIPL, // VZIP, left result
14040 OP_VZIPR, // VZIP, right result
14041 OP_VTRNL, // VTRN, left result
14042 OP_VTRNR, // VTRN, right result
14043 OP_MOVLANE // Move lane. RHSID is the lane to move into
14044 };
14045
14046 if (OpNum == OP_COPY) {
14047 if (LHSID == (1 * 9 + 2) * 9 + 3)
14048 return LHS;
14049 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14050 return RHS;
14051 }
14052
14053 if (OpNum == OP_MOVLANE) {
14054 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14055 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14056 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14057 Elt = 3 - Elt;
14058 while (Elt > 0) {
14059 ID /= 9;
14060 Elt--;
14061 }
14062 return (ID % 9 == 8) ? -1 : ID % 9;
14063 };
14064
14065 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14066 // get the lane to move from the PFID, which is always from the
14067 // original vectors (V1 or V2).
14068     SDValue OpLHS = GeneratePerfectShuffle(
14069         LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14070 EVT VT = OpLHS.getValueType();
14071 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14072 unsigned ExtLane = 0;
14073 SDValue Input;
14074
14075 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
14076 // convert into a higher type.
14077 if (RHSID & 0x4) {
14078 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14079 if (MaskElt == -1)
14080 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14081 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14082 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14083 Input = MaskElt < 2 ? V1 : V2;
14084 if (VT.getScalarSizeInBits() == 16) {
14085 Input = DAG.getBitcast(MVT::v2f32, Input);
14086 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14087 } else {
14088 assert(VT.getScalarSizeInBits() == 32 &&
14089 "Expected 16 or 32 bit shuffle elements");
14090 Input = DAG.getBitcast(MVT::v2f64, Input);
14091 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14092 }
14093 } else {
14094 int MaskElt = getPFIDLane(ID, RHSID);
14095 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14096 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14097 Input = MaskElt < 4 ? V1 : V2;
14098 // Be careful about creating illegal types. Use f16 instead of i16.
14099 if (VT == MVT::v4i16) {
14100 Input = DAG.getBitcast(MVT::v4f16, Input);
14101 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14102 }
14103 }
14104     SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
14105                               Input.getValueType().getVectorElementType(),
14106 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14107 SDValue Ins =
14108 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14109 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14110 return DAG.getBitcast(VT, Ins);
14111 }
14112
14113 SDValue OpLHS, OpRHS;
14114 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14115 RHS, DAG, DL);
14116 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14117 RHS, DAG, DL);
14118 EVT VT = OpLHS.getValueType();
14119
14120 switch (OpNum) {
14121 default:
14122 llvm_unreachable("Unknown shuffle opcode!");
14123 case OP_VREV:
14124 // VREV divides the vector in half and swaps within the half.
14125 if (VT.getVectorElementType() == MVT::i32 ||
14126 VT.getVectorElementType() == MVT::f32)
14127 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14128 // vrev <4 x i16> -> REV32
14129 if (VT.getVectorElementType() == MVT::i16 ||
14130 VT.getVectorElementType() == MVT::f16 ||
14131 VT.getVectorElementType() == MVT::bf16)
14132 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14133 // vrev <4 x i8> -> REV16
14134 assert(VT.getVectorElementType() == MVT::i8);
14135 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
14136 case OP_VDUP0:
14137 case OP_VDUP1:
14138 case OP_VDUP2:
14139 case OP_VDUP3: {
14140 EVT EltTy = VT.getVectorElementType();
14141 unsigned Opcode;
14142 if (EltTy == MVT::i8)
14143 Opcode = AArch64ISD::DUPLANE8;
14144 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14145 Opcode = AArch64ISD::DUPLANE16;
14146 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14147 Opcode = AArch64ISD::DUPLANE32;
14148 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14149 Opcode = AArch64ISD::DUPLANE64;
14150 else
14151 llvm_unreachable("Invalid vector element type?");
14152
14153 if (VT.getSizeInBits() == 64)
14154 OpLHS = WidenVector(OpLHS, DAG);
14155 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14156 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14157 }
14158 case OP_VEXT1:
14159 case OP_VEXT2:
14160 case OP_VEXT3: {
14161 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14162 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14163 DAG.getConstant(Imm, DL, MVT::i32));
14164 }
14165 case OP_VUZPL:
14166 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14167 case OP_VUZPR:
14168 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14169 case OP_VZIPL:
14170 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14171 case OP_VZIPR:
14172 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14173 case OP_VTRNL:
14174 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14175 case OP_VTRNR:
14176 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14177 }
14178}
14179
14180static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
14181                           SelectionDAG &DAG) {
14182 // Check to see if we can use the TBL instruction.
14183 SDValue V1 = Op.getOperand(0);
14184 SDValue V2 = Op.getOperand(1);
14185 SDLoc DL(Op);
14186
14187 EVT EltVT = Op.getValueType().getVectorElementType();
14188 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14189
14190 bool Swap = false;
14191 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14192 std::swap(V1, V2);
14193 Swap = true;
14194 }
14195
14196 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14197 // out of range values with 0s. We do need to make sure that any out-of-range
14198 // values are really out-of-range for a v16i8 vector.
14199 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
14200 MVT IndexVT = MVT::v8i8;
14201 unsigned IndexLen = 8;
14202 if (Op.getValueSizeInBits() == 128) {
14203 IndexVT = MVT::v16i8;
14204 IndexLen = 16;
14205 }
14206
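  // Each shuffle-mask element expands to BytesPerElt byte indices into the
  // concatenated V1:V2 byte table. For example, with IndexLen == 8 a mask
  // element of 10 yields byte index 10, i.e. byte 2 of V2; when V2 is
  // undef/zero, out-of-range indices are forced to 255 so TBL writes 0.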
14207  SmallVector<SDValue, 8> TBLMask;
14208  for (int Val : ShuffleMask) {
14209 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14210 unsigned Offset = Byte + Val * BytesPerElt;
14211 if (Swap)
14212 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
14213 if (IsUndefOrZero && Offset >= IndexLen)
14214 Offset = 255;
14215 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14216 }
14217 }
14218
14219 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14220 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14221
14222 SDValue Shuffle;
14223 if (IsUndefOrZero) {
14224 if (IndexLen == 8)
14225 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14226 Shuffle = DAG.getNode(
14227 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14228 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
14229 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14230 } else {
14231 if (IndexLen == 8) {
14232 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14233 Shuffle = DAG.getNode(
14234 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14235 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
14236 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14237 } else {
14238 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14239 // cannot currently represent the register constraints on the input
14240 // table registers.
14241 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14242 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14243 // IndexLen));
14244 Shuffle = DAG.getNode(
14245 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14246 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
14247 V2Cst,
14248 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14249 }
14250 }
14251 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14252}
14253
14254static unsigned getDUPLANEOp(EVT EltType) {
14255 if (EltType == MVT::i8)
14256 return AArch64ISD::DUPLANE8;
14257 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14258 return AArch64ISD::DUPLANE16;
14259 if (EltType == MVT::i32 || EltType == MVT::f32)
14260 return AArch64ISD::DUPLANE32;
14261 if (EltType == MVT::i64 || EltType == MVT::f64)
14262 return AArch64ISD::DUPLANE64;
14263
14264 llvm_unreachable("Invalid vector element type?");
14265}
14266
14267static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14268 unsigned Opcode, SelectionDAG &DAG) {
14269 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14270 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14271 // Match: dup (bitcast (extract_subv X, C)), LaneC
14272 if (BitCast.getOpcode() != ISD::BITCAST ||
14273        BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
14274      return false;
14275
14276 // The extract index must align in the destination type. That may not
14277 // happen if the bitcast is from narrow to wide type.
14278 SDValue Extract = BitCast.getOperand(0);
14279 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14280 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14281 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14282 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14283 if (ExtIdxInBits % CastedEltBitWidth != 0)
14284 return false;
14285
14286 // Can't handle cases where vector size is not 128-bit
14287 if (!Extract.getOperand(0).getValueType().is128BitVector())
14288 return false;
14289
14290 // Update the lane value by offsetting with the scaled extract index.
14291 LaneC += ExtIdxInBits / CastedEltBitWidth;
14292
14293 // Determine the casted vector type of the wide vector input.
14294 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14295 // Examples:
14296 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14297 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14298 unsigned SrcVecNumElts =
14299 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
14300    CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
14301                              SrcVecNumElts);
14302 return true;
14303 };
14304 MVT CastVT;
14305 if (getScaledOffsetDup(V, Lane, CastVT)) {
14306 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14307 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14308 V.getOperand(0).getValueType().is128BitVector()) {
14309 // The lane is incremented by the index of the extract.
14310 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14311 Lane += V.getConstantOperandVal(1);
14312 V = V.getOperand(0);
14313 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14314 // The lane is decremented if we are splatting from the 2nd operand.
14315 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14316 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14317 Lane -= Idx * VT.getVectorNumElements() / 2;
14318 V = WidenVector(V.getOperand(Idx), DAG);
14319 } else if (VT.getSizeInBits() == 64) {
14320 // Widen the operand to 128-bit register with undef.
14321 V = WidenVector(V, DAG);
14322 }
14323 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14324}
14325
14326// Try to widen element type to get a new mask value for a better permutation
14327// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14328// UZP1/2, TRN1/2, REV, INS, etc.
14329// For example:
14330// shufflevector <4 x i32> %a, <4 x i32> %b,
14331// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14332// is equivalent to:
14333// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14334// Finally, we can get:
14335// mov v0.d[0], v1.d[1]
14336static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
14337  SDLoc DL(Op);
14338 EVT VT = Op.getValueType();
14339 EVT ScalarVT = VT.getVectorElementType();
14340 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14341 SDValue V0 = Op.getOperand(0);
14342 SDValue V1 = Op.getOperand(1);
14343 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14344
14345  // We combine adjacent elements, e.g. two i16's -> i32, two i32's -> i64.
14346  // The wider element type must be legal, so ElementSize must be no larger
14347  // than 32 bits, and the i1 type is also excluded.
14348 if (ElementSize > 32 || ElementSize == 1)
14349 return SDValue();
14350
14351 SmallVector<int, 8> NewMask;
14352 if (widenShuffleMaskElts(Mask, NewMask)) {
14353 MVT NewEltVT = VT.isFloatingPoint()
14354 ? MVT::getFloatingPointVT(ElementSize * 2)
14355 : MVT::getIntegerVT(ElementSize * 2);
14356 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14357 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14358 V0 = DAG.getBitcast(NewVT, V0);
14359 V1 = DAG.getBitcast(NewVT, V1);
14360 return DAG.getBitcast(VT,
14361 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14362 }
14363 }
14364
14365 return SDValue();
14366}
14367
14368// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
14369static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
14370                                               ArrayRef<int> ShuffleMask,
14371 SelectionDAG &DAG) {
14372 SDValue Tbl1 = Op->getOperand(0);
14373 SDValue Tbl2 = Op->getOperand(1);
14374 SDLoc DL(Op);
14375 SDValue Tbl2ID =
14376 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14377
14378 EVT VT = Op.getValueType();
14379 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14380 Tbl1.getOperand(0) != Tbl2ID ||
14381      Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14382      Tbl2.getOperand(0) != Tbl2ID)
14383 return SDValue();
14384
14385 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14386 return SDValue();
14387
14388 SDValue Mask1 = Tbl1.getOperand(3);
14389 SDValue Mask2 = Tbl2.getOperand(3);
14390 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14391 Mask2.getOpcode() != ISD::BUILD_VECTOR)
14392 return SDValue();
14393
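  // Build a 64-byte tbl4 mask: Tbl1's two table registers occupy bytes 0..31
  // and Tbl2's occupy bytes 32..63, so indices taken from Tbl2's mask are
  // rebased by adding 32.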
14394 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
14395 for (unsigned I = 0; I < 16; I++) {
14396 if (ShuffleMask[I] < 16)
14397 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
14398 else {
14399 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
14400 if (!C)
14401 return SDValue();
14402 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
14403 }
14404 }
14405
14406 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
14407 SDValue ID =
14408 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
14409
14410 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
14411 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
14412 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
14413}
14414
14415// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14416// but we don't have an appropriate instruction,
14417// so custom-lower it as ZIP1-with-zeros.
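// For example, zero-extending the low half of a v16i8 into a v8i16: ZIP1
// interleaves the low eight source bytes with zero bytes, and the bitcast
// reads each (byte, zero) pair back as a zero-extended i16 lane
// (little-endian lane order).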
14418SDValue
14419AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14420 SelectionDAG &DAG) const {
14421 SDLoc DL(Op);
14422 EVT VT = Op.getValueType();
14423 SDValue SrcOp = Op.getOperand(0);
14424 EVT SrcVT = SrcOp.getValueType();
14425 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14426 "Unexpected extension factor.");
14427 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14428 // FIXME: support multi-step zipping?
14429 if (Scale != 2)
14430 return SDValue();
14431 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
14432 return DAG.getBitcast(VT,
14433 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14434}
14435
14436SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
14437 SelectionDAG &DAG) const {
14438 SDLoc DL(Op);
14439 EVT VT = Op.getValueType();
14440
14441 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
14442
14443 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14444 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
14445
14446 // Convert shuffles that are directly supported on NEON to target-specific
14447 // DAG nodes, instead of keeping them as shuffles and matching them again
14448 // during code selection. This is more efficient and avoids the possibility
14449 // of inconsistencies between legalization and selection.
14450 ArrayRef<int> ShuffleMask = SVN->getMask();
14451
14452 SDValue V1 = Op.getOperand(0);
14453 SDValue V2 = Op.getOperand(1);
14454
14455 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
14456 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
14457 "Unexpected VECTOR_SHUFFLE mask size!");
14458
14459 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
14460 return Res;
14461
14462 if (SVN->isSplat()) {
14463 int Lane = SVN->getSplatIndex();
14464 // If this is undef splat, generate it via "just" vdup, if possible.
14465 if (Lane == -1)
14466 Lane = 0;
14467
14468 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
14469 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
14470 V1.getOperand(0));
14471 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
14472 // constant. If so, we can just reference the lane's definition directly.
14473 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
14474        !isa<ConstantSDNode>(V1.getOperand(Lane)))
14475      return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
14476
14477 // Otherwise, duplicate from the lane of the input vector.
14478 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
14479 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
14480 }
14481
14482 // Check if the mask matches a DUP for a wider element
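  // For example, a v4i32 shuffle with mask <2,3,2,3> splats the 64-bit lane
  // formed by elements 2 and 3, so V1 is bitcast to v2i64 and the shuffle is
  // lowered as a DUPLANE64 of lane 1.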
14483 for (unsigned LaneSize : {64U, 32U, 16U}) {
14484 unsigned Lane = 0;
14485 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
14486 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14487 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14488 : AArch64ISD::DUPLANE16;
14489 // Cast V1 to an integer vector with required lane size
14490 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
14491 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14492 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
14493 V1 = DAG.getBitcast(NewVecTy, V1);
14494 // Construct the DUP instruction
14495 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
14496 // Cast back to the original type
14497 return DAG.getBitcast(VT, V1);
14498 }
14499 }
14500
14501 unsigned NumElts = VT.getVectorNumElements();
14502 unsigned EltSize = VT.getScalarSizeInBits();
14503 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
14504 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
14505 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
14506 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
14507 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
14508 return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);
14509
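  // A full reverse of a vector with 64-bit halves can be built from REV64 and
  // EXT. For example, reversing a v8i16: REV64 gives <3,2,1,0,7,6,5,4> and
  // EXT #8 then swaps the two 64-bit halves, producing <7,6,5,4,3,2,1,0>.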
14510 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
14511 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
14512 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
14513 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
14514 DAG.getConstant(8, DL, MVT::i32));
14515 }
14516
14517 bool ReverseEXT = false;
14518 unsigned Imm;
14519 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
14520 if (ReverseEXT)
14521 std::swap(V1, V2);
14522 Imm *= getExtFactor(V1);
14523 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
14524 DAG.getConstant(Imm, DL, MVT::i32));
14525 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
14526 Imm *= getExtFactor(V1);
14527 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
14528 DAG.getConstant(Imm, DL, MVT::i32));
14529 }
14530
14531 unsigned WhichResult;
14532 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
14533 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14534 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14535 }
14536 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
14537 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14538 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14539 }
14540 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
14541 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14542 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14543 }
14544
14545 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14546 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14547 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14548 }
14549 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14550 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14551 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14552 }
14553 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14554 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14555 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14556 }
14557
14558  if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
14559    return Concat;
14560
14561 bool DstIsLeft;
14562 int Anomaly;
14563 int NumInputElements = V1.getValueType().getVectorNumElements();
14564 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
14565 SDValue DstVec = DstIsLeft ? V1 : V2;
14566 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
14567
14568 SDValue SrcVec = V1;
14569 int SrcLane = ShuffleMask[Anomaly];
14570 if (SrcLane >= NumInputElements) {
14571 SrcVec = V2;
14572 SrcLane -= NumElts;
14573 }
14574 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
14575
14576 EVT ScalarVT = VT.getVectorElementType();
14577
14578 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
14579 ScalarVT = MVT::i32;
14580
14581 return DAG.getNode(
14582 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
14583 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
14584 DstLaneV);
14585 }
14586
14587 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
14588 return NewSD;
14589
14590 // If the shuffle is not directly supported and it has 4 elements, use
14591 // the PerfectShuffle-generated table to synthesize it from other shuffles.
14592 if (NumElts == 4) {
14593 unsigned PFIndexes[4];
14594 for (unsigned i = 0; i != 4; ++i) {
14595 if (ShuffleMask[i] < 0)
14596 PFIndexes[i] = 8;
14597 else
14598 PFIndexes[i] = ShuffleMask[i];
14599 }
14600
14601 // Compute the index in the perfect shuffle table.
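    // Each mask element is a base-9 digit (0-7 select a lane, 8 means undef).
    // For example, the mask <4,5,0,1> gives 4*729 + 5*81 + 0*9 + 1 == 3322.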
14602 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14603 PFIndexes[2] * 9 + PFIndexes[3];
14604 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14605 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14606 DL);
14607 }
14608
14609 // Check for a "select shuffle", generating a BSL to pick between lanes in
14610 // V1/V2.
14611 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14612 assert(VT.getScalarSizeInBits() <= 32 &&
14613 "Expected larger vector element sizes to be handled already");
14614 SmallVector<SDValue> MaskElts;
14615 for (int M : ShuffleMask)
14616 MaskElts.push_back(DAG.getConstant(
14617 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
14618 EVT IVT = VT.changeVectorElementTypeToInteger();
14619 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
14620 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
14621 DAG.getBitcast(IVT, V1),
14622 DAG.getBitcast(IVT, V2)));
14623 }
14624
14625 // Fall back to generating a TBL
14626 return GenerateTBL(Op, ShuffleMask, DAG);
14627}
14628
14629SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14630 SelectionDAG &DAG) const {
14631 EVT VT = Op.getValueType();
14632
14633 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14634 return LowerToScalableOp(Op, DAG);
14635
14636 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14637 "Unexpected vector type!");
14638
14639 // We can handle the constant cases during isel.
14640 if (isa<ConstantSDNode>(Op.getOperand(0)))
14641 return Op;
14642
14643 // There isn't a natural way to handle the general i1 case, so we use some
14644 // trickery with whilelo.
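  // The splat value is sign-extended from i1 to either 0 or all-ones (the
  // maximum unsigned value), so whilelo(0, SplatVal) produces an all-false
  // predicate for a false splat and an all-true predicate for a true splat.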
14645 SDLoc DL(Op);
14646 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14647 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14648 DAG.getValueType(MVT::i1));
14649 SDValue ID =
14650 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14651 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14652 if (VT == MVT::nxv1i1)
14653 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14654 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14655 Zero, SplatVal),
14656 Zero);
14657 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14658}
14659
14660SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14661 SelectionDAG &DAG) const {
14662 SDLoc DL(Op);
14663
14664 EVT VT = Op.getValueType();
14665 if (!isTypeLegal(VT) || !VT.isScalableVector())
14666 return SDValue();
14667
14668 // Current lowering only supports the SVE-ACLE types.
14669  if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14670    return SDValue();
14671
14672 // The DUPQ operation is independent of element type so normalise to i64s.
14673 SDValue Idx128 = Op.getOperand(2);
14674
14675 // DUPQ can be used when idx is in range.
14676 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14677 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14678 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14679 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14680 }
14681
14682 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14683
14684 // The ACLE says this must produce the same result as:
14685 // svtbl(data, svadd_x(svptrue_b64(),
14686 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14687 // index * 2))
14688 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14689 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14690
14691 // create the vector 0,1,0,1,...
14692 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14693 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14694
14695 // create the vector idx64,idx64+1,idx64,idx64+1,...
14696 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14697 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14698 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
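  // For example, with Idx128 == 1 the mask is 2,3,2,3,..., so every 128-bit
  // quadword of the result is a copy of quadword 1 of the input.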
14699
14700 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14701 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14702 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14703}
14704
14705
14706static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14707 APInt &UndefBits) {
14708 EVT VT = BVN->getValueType(0);
14709 APInt SplatBits, SplatUndef;
14710 unsigned SplatBitSize;
14711 bool HasAnyUndefs;
14712 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14713 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14714
14715 for (unsigned i = 0; i < NumSplats; ++i) {
14716 CnstBits <<= SplatBitSize;
14717 UndefBits <<= SplatBitSize;
14718 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14719 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14720 }
14721
14722 return true;
14723 }
14724
14725 return false;
14726}
14727
14728// Try 64-bit splatted SIMD immediate.
14729static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14730 const APInt &Bits) {
14731 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14732 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14733 EVT VT = Op.getValueType();
14734 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14735
14736    if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14737      Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14738
14739 SDLoc DL(Op);
14740 SDValue Mov =
14741 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14742 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14743 }
14744 }
14745
14746 return SDValue();
14747}
14748
14749// Try 32-bit splatted SIMD immediate.
14750static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14751 const APInt &Bits,
14752 const SDValue *LHS = nullptr) {
14753 EVT VT = Op.getValueType();
14754 if (VT.isFixedLengthVector() &&
14755      !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14756    return SDValue();
14757
14758 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14759 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14760 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14761 bool isAdvSIMDModImm = false;
14762 uint64_t Shift;
14763
14764 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14765      Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14766      Shift = 0;
14767 }
14768 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14769      Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14770      Shift = 8;
14771 }
14772 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14773      Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14774      Shift = 16;
14775 }
14776 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14777      Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14778      Shift = 24;
14779 }
14780
14781 if (isAdvSIMDModImm) {
14782 SDLoc DL(Op);
14783 SDValue Mov;
14784
14785 if (LHS)
14786 Mov = DAG.getNode(NewOp, DL, MovTy,
14787 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14788 DAG.getConstant(Value, DL, MVT::i32),
14789 DAG.getConstant(Shift, DL, MVT::i32));
14790 else
14791 Mov =
14792 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14793 DAG.getConstant(Shift, DL, MVT::i32));
14794
14795 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14796 }
14797 }
14798
14799 return SDValue();
14800}
14801
14802// Try 16-bit splatted SIMD immediate.
14803static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14804 const APInt &Bits,
14805 const SDValue *LHS = nullptr) {
14806 EVT VT = Op.getValueType();
14807 if (VT.isFixedLengthVector() &&
14808      !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14809    return SDValue();
14810
14811 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14812 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14813 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14814 bool isAdvSIMDModImm = false;
14815 uint64_t Shift;
14816
14817 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14818      Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14819      Shift = 0;
14820 }
14821 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14822      Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14823      Shift = 8;
14824 }
14825
14826 if (isAdvSIMDModImm) {
14827 SDLoc DL(Op);
14828 SDValue Mov;
14829
14830 if (LHS)
14831 Mov = DAG.getNode(NewOp, DL, MovTy,
14832 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14833 DAG.getConstant(Value, DL, MVT::i32),
14834 DAG.getConstant(Shift, DL, MVT::i32));
14835 else
14836 Mov =
14837 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14838 DAG.getConstant(Shift, DL, MVT::i32));
14839
14840 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14841 }
14842 }
14843
14844 return SDValue();
14845}
14846
14847// Try 32-bit splatted SIMD immediate with shifted ones.
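// The Shift values used below, 264 and 272, are the AArch64_AM shifter
// encodings (4 << 6) | 8 and (4 << 6) | 16, i.e. MSL #8 and MSL #16.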
14848static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14849                                    SelectionDAG &DAG, const APInt &Bits) {
14850 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14851 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14852 EVT VT = Op.getValueType();
14853 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14854 bool isAdvSIMDModImm = false;
14855 uint64_t Shift;
14856
14857 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14858      Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14859      Shift = 264;
14860 }
14861 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14862      Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14863      Shift = 272;
14864 }
14865
14866 if (isAdvSIMDModImm) {
14867 SDLoc DL(Op);
14868 SDValue Mov =
14869 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14870 DAG.getConstant(Shift, DL, MVT::i32));
14871 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14872 }
14873 }
14874
14875 return SDValue();
14876}
14877
14878// Try 8-bit splatted SIMD immediate.
14879static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14880 const APInt &Bits) {
14881 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14882 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14883 EVT VT = Op.getValueType();
14884 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14885
14886    if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14887      Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14888
14889 SDLoc DL(Op);
14890 SDValue Mov =
14891 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14892 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14893 }
14894 }
14895
14896 return SDValue();
14897}
14898
14899// Try FP splatted SIMD immediate.
14900static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14901 const APInt &Bits) {
14902 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14903 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14904 EVT VT = Op.getValueType();
14905 bool isWide = (VT.getSizeInBits() == 128);
14906 MVT MovTy;
14907 bool isAdvSIMDModImm = false;
14908
14909 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
14910      Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
14911      MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14912 }
14913 else if (isWide &&
14914 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
14915      Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
14916      MovTy = MVT::v2f64;
14917 }
14918
14919 if (isAdvSIMDModImm) {
14920 SDLoc DL(Op);
14921 SDValue Mov =
14922 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14923 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14924 }
14925 }
14926
14927 return SDValue();
14928}
14929
14930// Specialized code to quickly find if PotentialBVec is a BuildVector that
14931// consists of only the same constant int value, returned in reference arg
14932// ConstVal
14933static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
14934 uint64_t &ConstVal) {
14935 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
14936 if (!Bvec)
14937 return false;
14938  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
14939  if (!FirstElt)
14940 return false;
14941 EVT VT = Bvec->getValueType(0);
14942 unsigned NumElts = VT.getVectorNumElements();
14943 for (unsigned i = 1; i < NumElts; ++i)
14944 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
14945 return false;
14946 ConstVal = FirstElt->getZExtValue();
14947 return true;
14948}
14949
14950static bool isAllInactivePredicate(SDValue N) {
14951  // Look through cast.
14952 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
14953 N = N.getOperand(0);
14954
14955 return ISD::isConstantSplatVectorAllZeros(N.getNode());
14956}
14957
14958static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
14959  unsigned NumElts = N.getValueType().getVectorMinNumElements();
14960
14961 // Look through cast.
14962 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14963 N = N.getOperand(0);
14964 // When reinterpreting from a type with fewer elements the "new" elements
14965 // are not active, so bail if they're likely to be used.
14966 if (N.getValueType().getVectorMinNumElements() < NumElts)
14967 return false;
14968 }
14969
14970 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
14971 return true;
14972
14973 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14974 // or smaller than the implicit element type represented by N.
14975 // NOTE: A larger element count implies a smaller element type.
14976 if (N.getOpcode() == AArch64ISD::PTRUE &&
14977 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
14978 return N.getValueType().getVectorMinNumElements() >= NumElts;
14979
14980 // If we're compiling for a specific vector-length, we can check if the
14981 // pattern's VL equals that of the scalable vector at runtime.
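  // For example, if the SVE register size is known to be exactly 256 bits,
  // VScale == 2, so a "ptrue p.s, vl8" is all active for an nxv4i1 predicate
  // because 4 * 2 == 8.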
14982 if (N.getOpcode() == AArch64ISD::PTRUE) {
14983 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14984 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
14985 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
14986 if (MaxSVESize && MinSVESize == MaxSVESize) {
14987 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
14988 unsigned PatNumElts =
14989 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
14990 return PatNumElts == (NumElts * VScale);
14991 }
14992 }
14993
14994 return false;
14995}
14996
14997// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
14998// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
14999// BUILD_VECTOR with constant element C1, C2 is a constant, and:
15000// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15001// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15002// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
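// For example, with v4i32 elements and C2 == 8 the SLI case requires
// C1 == 0x000000FF, so (or (and X, 0xFF), (shl Y, 8)) becomes SLI X, Y, #8:
// Y shifted left by 8 is inserted while the low 8 bits of X are preserved.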
15003static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
15004  EVT VT = N->getValueType(0);
15005
15006 if (!VT.isVector())
15007 return SDValue();
15008
15009 SDLoc DL(N);
15010
15011 SDValue And;
15012 SDValue Shift;
15013
15014 SDValue FirstOp = N->getOperand(0);
15015 unsigned FirstOpc = FirstOp.getOpcode();
15016 SDValue SecondOp = N->getOperand(1);
15017 unsigned SecondOpc = SecondOp.getOpcode();
15018
15019 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15020 // a BICi in order to use an immediate instead of a register.
15021  // Is the other operand a shl or lshr? This will have been turned into:
15022 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15023 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
15024 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15025 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15026 SecondOpc == AArch64ISD::SHL_PRED ||
15027 SecondOpc == AArch64ISD::SRL_PRED)) {
15028 And = FirstOp;
15029 Shift = SecondOp;
15030
15031 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15032 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15033 FirstOpc == AArch64ISD::SHL_PRED ||
15034 FirstOpc == AArch64ISD::SRL_PRED)) {
15035 And = SecondOp;
15036 Shift = FirstOp;
15037 } else
15038 return SDValue();
15039
15040 bool IsAnd = And.getOpcode() == ISD::AND;
15041 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15042 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15043 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15044 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15045
15046 // Is the shift amount constant and are all lanes active?
15047 uint64_t C2;
15048 if (ShiftHasPredOp) {
15049 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15050 return SDValue();
15051 APInt C;
15052    if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
15053      return SDValue();
15054 C2 = C.getZExtValue();
15055 } else if (ConstantSDNode *C2node =
15056                 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
15057    C2 = C2node->getZExtValue();
15058 else
15059 return SDValue();
15060
15061 APInt C1AsAPInt;
15062 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15063 if (IsAnd) {
15064 // Is the and mask vector all constant?
15065 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15066 return SDValue();
15067 } else {
15068 // Reconstruct the corresponding AND immediate from the two BICi immediates.
15069 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15070 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15071 assert(C1nodeImm && C1nodeShift);
15072 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15073 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15074 }
15075
15076 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15077 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15078 // how much one can shift elements of a particular size?
15079 if (C2 > ElemSizeInBits)
15080 return SDValue();
15081
15082 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15083 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15084 if (C1AsAPInt != RequiredC1)
15085 return SDValue();
15086
15087 SDValue X = And.getOperand(0);
15088 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15089 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15090 : Shift.getOperand(1);
15091
15092 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15093 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
15094
15095 return ResultSLI;
15096}
15097
15098static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
15099  EVT VT = N->getValueType(0);
15100 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15101 SDLoc DL(N);
15102 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15103
15104 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15105 return SDValue();
15106
15107 SDValue N0 = N->getOperand(0);
15108 if (N0.getOpcode() != ISD::AND)
15109 return SDValue();
15110
15111 SDValue N1 = N->getOperand(1);
15112 if (N1.getOpcode() != ISD::AND)
15113 return SDValue();
15114
15115 // InstCombine does (not (neg a)) => (add a -1).
15116 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15117 // Loop over all combinations of AND operands.
15118 for (int i = 1; i >= 0; --i) {
15119 for (int j = 1; j >= 0; --j) {
15120 SDValue O0 = N0->getOperand(i);
15121 SDValue O1 = N1->getOperand(j);
15122 SDValue Sub, Add, SubSibling, AddSibling;
15123
15124 // Find a SUB and an ADD operand, one from each AND.
15125 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15126 Sub = O0;
15127 Add = O1;
15128 SubSibling = N0->getOperand(1 - i);
15129 AddSibling = N1->getOperand(1 - j);
15130 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15131 Add = O0;
15132 Sub = O1;
15133 AddSibling = N0->getOperand(1 - i);
15134 SubSibling = N1->getOperand(1 - j);
15135 } else
15136 continue;
15137
15138 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15139 continue;
15140
15141      // The all-ones constant is always the right-hand operand of the Add.
15142 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15143 continue;
15144
15145 if (Sub.getOperand(1) != Add.getOperand(0))
15146 continue;
15147
15148 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15149 }
15150 }
15151
15152 // (or (and a b) (and (not a) c)) => (bsl a b c)
15153 // We only have to look for constant vectors here since the general, variable
15154 // case can be handled in TableGen.
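  // For example, if N0's constant operand is a splat of 0x00FF00FF and N1's is
  // a splat of 0xFF00FF00, then (or (and b, 0x00FF00FF), (and c, 0xFF00FF00))
  // becomes (bsl 0x00FF00FF, b, c): bits are taken from b where the mask is
  // set and from c elsewhere.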
15155 unsigned Bits = VT.getScalarSizeInBits();
15156 for (int i = 1; i >= 0; --i)
15157 for (int j = 1; j >= 0; --j) {
15158 APInt Val1, Val2;
15159
15160 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15161          ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
15162          ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15163 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15164 N0->getOperand(1 - i), N1->getOperand(1 - j));
15165 }
15166      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15167      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15168      if (!BVN0 || !BVN1)
15169 continue;
15170
15171 bool FoundMatch = true;
15172 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15173        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15174        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15175        if (!CN0 || !CN1 ||
15176 CN0->getAPIntValue().trunc(Bits) !=
15177 ~CN1->getAsAPIntVal().trunc(Bits)) {
15178 FoundMatch = false;
15179 break;
15180 }
15181 }
15182 if (FoundMatch)
15183 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15184 N0->getOperand(1 - i), N1->getOperand(1 - j));
15185 }
15186
15187 return SDValue();
15188}
15189
15190SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15191 SelectionDAG &DAG) const {
15192 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15193 !Subtarget->isNeonAvailable()))
15194 return LowerToScalableOp(Op, DAG);
15195
15196 if (SDValue Res = tryLowerToBSL(Op, DAG))
15197 return Res;
15198
15199 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15200 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15201 return Res;
15202
15203 EVT VT = Op.getValueType();
15204 if (VT.isScalableVector())
15205 return Op;
15206
15207 SDValue LHS = Op.getOperand(0);
15208 BuildVectorSDNode *BVN =
15209 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15210 if (!BVN) {
15211 // OR commutes, so try swapping the operands.
15212 LHS = Op.getOperand(1);
15213 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15214 }
15215 if (!BVN)
15216 return Op;
15217
15218 APInt DefBits(VT.getSizeInBits(), 0);
15219 APInt UndefBits(VT.getSizeInBits(), 0);
15220 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15221 SDValue NewOp;
15222
15223 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15224 DefBits, &LHS)) ||
15225 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15226 DefBits, &LHS)))
15227 return NewOp;
15228
15229 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15230 UndefBits, &LHS)) ||
15231 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15232 UndefBits, &LHS)))
15233 return NewOp;
15234 }
15235
15236 // We can always fall back to a non-immediate OR.
15237 return Op;
15238}
15239
15240// Normalize the operands of BUILD_VECTOR. The value of constant operands will
15241// be truncated to fit element width.
15242static SDValue NormalizeBuildVector(SDValue Op,
15243                                    SelectionDAG &DAG) {
15244 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15245 SDLoc DL(Op);
15246 EVT VT = Op.getValueType();
15247 EVT EltTy= VT.getVectorElementType();
15248
15249 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15250 return Op;
15251
15252  SmallVector<SDValue, 16> Ops;
15253  for (SDValue Lane : Op->ops()) {
15254 // For integer vectors, type legalization would have promoted the
15255 // operands already. Otherwise, if Op is a floating-point splat
15256 // (with operands cast to integers), then the only possibilities
15257 // are constants and UNDEFs.
15258 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
15259 Lane = DAG.getConstant(
15260 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15261 DL, MVT::i32);
15262 } else if (Lane.getNode()->isUndef()) {
15263 Lane = DAG.getUNDEF(MVT::i32);
15264 } else {
15265 assert(Lane.getValueType() == MVT::i32 &&
15266 "Unexpected BUILD_VECTOR operand type");
15267 }
15268 Ops.push_back(Lane);
15269 }
15270 return DAG.getBuildVector(VT, DL, Ops);
15271}
15272
15273static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG,
15274                             const AArch64Subtarget *ST, APInt &DefBits) {
15275 EVT VT = Op.getValueType();
15276 // TODO: We should be able to support 64-bit destinations too
15277 if (!ST->hasSVE() || !VT.is128BitVector() ||
15278 DefBits.getHiBits(64) != DefBits.getLoBits(64))
15279 return SDValue();
15280
15281 // See if we can make use of the SVE dup instruction.
15282 APInt Val64 = DefBits.trunc(64);
15283 int32_t ImmVal, ShiftVal;
15284 if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal))
15285 return SDValue();
15286
15287 SDLoc DL(Op);
15288 SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL,
15289 DAG.getConstant(Val64, DL, MVT::i64));
15290 SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal);
15291 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res);
15292}
15293
15294static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
15295                                   const AArch64Subtarget *ST) {
15296 EVT VT = Op.getValueType();
15297 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15298 "Expected a legal NEON vector");
15299
15300 APInt DefBits(VT.getSizeInBits(), 0);
15301 APInt UndefBits(VT.getSizeInBits(), 0);
15302  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15303  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15304 auto TryMOVIWithBits = [&](APInt DefBits) {
15305 SDValue NewOp;
15306 if ((NewOp =
15307 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15308 (NewOp =
15309 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15310 (NewOp =
15311 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15312 (NewOp =
15313 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15314 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15315 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15316 return NewOp;
15317
15318 APInt NotDefBits = ~DefBits;
15319 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15320 NotDefBits)) ||
15321 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15322 NotDefBits)) ||
15323 (NewOp =
15324 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15325 return NewOp;
15326 return SDValue();
15327 };
15328 if (SDValue R = TryMOVIWithBits(DefBits))
15329 return R;
15330 if (SDValue R = TryMOVIWithBits(UndefBits))
15331 return R;
15332
15333 // Try to materialise the constant using SVE when available.
15334 if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
15335 return R;
15336
15337 // See if a fneg of the constant can be materialized with a MOVI, etc
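    // For example, a v2f64 splat of -0.0 (only the sign bit of each lane set)
    // has no MOVI/MVNI/FMOV encoding, but clearing the sign bits leaves all
    // zeros, which is MOVI #0, so the constant is rebuilt as (fneg (movi 0)).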
15338 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15339 // FNegate each sub-element of the constant
15340 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
15341 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15342 .zext(VT.getSizeInBits());
15343 APInt NegBits(VT.getSizeInBits(), 0);
15344 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15345 for (unsigned i = 0; i < NumElts; i++)
15346 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15347 NegBits = DefBits ^ NegBits;
15348
15349 // Try to create the new constants with MOVI, and if so generate a fneg
15350 // for it.
15351 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15352 SDLoc DL(Op);
15353 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15354 return DAG.getNode(
15355 AArch64ISD::NVCAST, DL, VT,
15356 DAG.getNode(ISD::FNEG, DL, VFVT,
15357 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15358 }
15359 return SDValue();
15360 };
15361 SDValue R;
15362 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15363 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15364 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15365 return R;
15366 }
15367
15368 return SDValue();
15369}
15370
15371SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15372 SDValue Op, SelectionDAG &DAG) const {
15373 EVT VT = Op.getValueType();
15374 SDLoc DL(Op);
15375 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15376 auto *BVN = cast<BuildVectorSDNode>(Op);
15377
15378 if (auto SeqInfo = BVN->isConstantSequence()) {
15379 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15380 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15381 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15382 return convertFromScalableVector(DAG, VT, Seq);
15383 }
15384
15385 unsigned NumElems = VT.getVectorNumElements();
15386 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15387 NumElems <= 1 || BVN->isConstant())
15388 return SDValue();
15389
15390 auto IsExtractElt = [](SDValue Op) {
15391 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15392 };
15393
15394  // For integer types that are not already in vectors, limit to at most four
15395 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15396 if (VT.getScalarType().isInteger() &&
15397 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15398 return SDValue();
15399
15400 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
15401 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
15402  SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
15403      Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
15404 return Op.isUndef() ? Undef
15405 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15406 ContainerVT, Undef, Op, ZeroI64);
15407 });
15408
15409 ElementCount ZipEC = ContainerVT.getVectorElementCount();
15410 while (Intermediates.size() > 1) {
15411 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15412
15413 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15414 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15415 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
15416 Intermediates[I / 2] =
15417 Op1.isUndef() ? Op0
15418 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15419 }
15420
15421 Intermediates.resize(Intermediates.size() / 2);
15422 ZipEC = ZipEC.divideCoefficientBy(2);
15423 }
15424
15425 assert(Intermediates.size() == 1);
15426 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15427 return convertFromScalableVector(DAG, VT, Vec);
15428}
15429
15430SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15431 SelectionDAG &DAG) const {
15432 EVT VT = Op.getValueType();
15433
15434 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15435 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15436 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15437 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15438
15439 // Try to build a simple constant vector.
15440 Op = NormalizeBuildVector(Op, DAG);
15441  // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
15442  // abort.
15443 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15444 return SDValue();
15445
15446 // Certain vector constants, used to express things like logical NOT and
15447 // arithmetic NEG, are passed through unmodified. This allows special
15448 // patterns for these operations to match, which will lower these constants
15449 // to whatever is proven necessary.
15450 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15451 if (BVN->isConstant()) {
15452 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15453 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15454 APInt Val(BitSize,
15455 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
15456 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15457 return Op;
15458 }
15459 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15460 if (Const->isZero() && !Const->isNegative())
15461 return Op;
15462 }
15463
15464 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
15465 return V;
15466
15467 // Scan through the operands to find some interesting properties we can
15468 // exploit:
15469 // 1) If only one value is used, we can use a DUP, or
15470 // 2) if only the low element is not undef, we can just insert that, or
15471 // 3) if only one constant value is used (w/ some non-constant lanes),
15472 // we can splat the constant value into the whole vector then fill
15473 // in the non-constant lanes.
15474 // 4) FIXME: If different constant values are used, but we can intelligently
15475 // select the values we'll be overwriting for the non-constant
15476 // lanes such that we can directly materialize the vector
15477 // some other way (MOVI, e.g.), we can be sneaky.
15478 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
15479 SDLoc DL(Op);
15480 unsigned NumElts = VT.getVectorNumElements();
15481 bool isOnlyLowElement = true;
15482 bool usesOnlyOneValue = true;
15483 bool usesOnlyOneConstantValue = true;
15484 bool isConstant = true;
15485 bool AllLanesExtractElt = true;
15486 unsigned NumConstantLanes = 0;
15487 unsigned NumDifferentLanes = 0;
15488 unsigned NumUndefLanes = 0;
15489 SDValue Value;
15490 SDValue ConstantValue;
15491 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
15492 unsigned ConsecutiveValCount = 0;
15493 SDValue PrevVal;
15494 for (unsigned i = 0; i < NumElts; ++i) {
15495 SDValue V = Op.getOperand(i);
15496 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15497 AllLanesExtractElt = false;
15498 if (V.isUndef()) {
15499 ++NumUndefLanes;
15500 continue;
15501 }
15502 if (i > 0)
15503 isOnlyLowElement = false;
15504 if (!isIntOrFPConstant(V))
15505 isConstant = false;
15506
15507 if (isIntOrFPConstant(V)) {
15508 ++NumConstantLanes;
15509 if (!ConstantValue.getNode())
15510 ConstantValue = V;
15511 else if (ConstantValue != V)
15512 usesOnlyOneConstantValue = false;
15513 }
15514
15515 if (!Value.getNode())
15516 Value = V;
15517 else if (V != Value) {
15518 usesOnlyOneValue = false;
15519 ++NumDifferentLanes;
15520 }
15521
15522 if (PrevVal != V) {
15523 ConsecutiveValCount = 0;
15524 PrevVal = V;
15525 }
15526
15527    // Keep each distinct value and its last consecutive count. For example,
15528 //
15529 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15530 // t24, t24, t24, t24, t24, t24, t24, t24
15531 // t23 = consecutive count 8
15532 // t24 = consecutive count 8
15533 // ------------------------------------------------------------------
15534 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
15535 // t24, t24, t24, t24, t24, t24, t24, t24
15536 // t23 = consecutive count 5
15537 // t24 = consecutive count 9
15538 DifferentValueMap[V] = ++ConsecutiveValCount;
15539 }
15540
15541 if (!Value.getNode()) {
15542 LLVM_DEBUG(
15543 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
15544 return DAG.getUNDEF(VT);
15545 }
15546
15547 // Convert BUILD_VECTOR where all elements but the lowest are undef into
15548 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
15549 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
15550 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
15551 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
15552 "SCALAR_TO_VECTOR node\n");
15553 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
15554 }
15555
15556 if (AllLanesExtractElt) {
15557 SDNode *Vector = nullptr;
15558 bool Even = false;
15559 bool Odd = false;
15560 // Check whether the extract elements match the Even pattern <0,2,4,...> or
15561 // the Odd pattern <1,3,5,...>.
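    // For example, a v4i32 build_vector of elements 0,2,4,6 of a v8i32 X is
    // UZP1 of X's low and high halves, and elements 1,3,5,7 give UZP2.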
15562 for (unsigned i = 0; i < NumElts; ++i) {
15563 SDValue V = Op.getOperand(i);
15564 const SDNode *N = V.getNode();
15565 if (!isa<ConstantSDNode>(N->getOperand(1))) {
15566 Even = false;
15567 Odd = false;
15568 break;
15569 }
15570 SDValue N0 = N->getOperand(0);
15571
15572 // All elements are extracted from the same vector.
15573 if (!Vector) {
15574 Vector = N0.getNode();
15575 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
15576 // BUILD_VECTOR.
15577 if (VT.getVectorElementType() !=
15578            V.getOperand(0).getValueType().getVectorElementType())
15579          break;
15580 } else if (Vector != N0.getNode()) {
15581 Odd = false;
15582 Even = false;
15583 break;
15584 }
15585
15586 // Extracted values are either at Even indices <0,2,4,...> or at Odd
15587 // indices <1,3,5,...>.
15588 uint64_t Val = N->getConstantOperandVal(1);
15589 if (Val == 2 * i) {
15590 Even = true;
15591 continue;
15592 }
15593 if (Val - 1 == 2 * i) {
15594 Odd = true;
15595 continue;
15596 }
15597
15598 // Something does not match: abort.
15599 Odd = false;
15600 Even = false;
15601 break;
15602 }
15603 if (Even || Odd) {
15604 SDValue LHS =
15605          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15606                      DAG.getConstant(0, DL, MVT::i64));
15607 SDValue RHS =
15608          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15609                      DAG.getConstant(NumElts, DL, MVT::i64));
15610
15611 if (Even && !Odd)
15612 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
15613 if (Odd && !Even)
15614 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
15615 }
15616 }
15617
15618 // Use DUP for non-constant splats. For f32 constant splats, reduce to
15619 // i32 and try again.
15620 if (usesOnlyOneValue) {
15621 if (!isConstant) {
15622 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15623 Value.getValueType() != VT) {
15624 LLVM_DEBUG(
15625 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
15626 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
15627 }
15628
15629      // This is actually a DUPLANExx operation, which keeps everything in vector registers.
15630
15631 SDValue Lane = Value.getOperand(1);
15632 Value = Value.getOperand(0);
15633 if (Value.getValueSizeInBits() == 64) {
15634 LLVM_DEBUG(
15635 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
15636 "widening it\n");
15637 Value = WidenVector(Value, DAG);
15638 }
15639
15640 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
15641 return DAG.getNode(Opcode, DL, VT, Value, Lane);
15642 }
15643
15644    if (VT.getVectorElementType().isFloatingPoint()) {
15645      SmallVector<SDValue, 8> Ops;
15646      EVT EltTy = VT.getVectorElementType();
15647 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
15648 EltTy == MVT::f64) && "Unsupported floating-point vector type");
15649 LLVM_DEBUG(
15650 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
15651 "BITCASTS, and try again\n");
15652 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
15653 for (unsigned i = 0; i < NumElts; ++i)
15654 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
15655 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
15656 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
15657 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
15658 Val.dump(););
15659 Val = LowerBUILD_VECTOR(Val, DAG);
15660 if (Val.getNode())
15661 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
15662 }
15663 }
15664
15665 // If we need to insert a small number of different non-constant elements and
15666 // the vector width is sufficiently large, prefer using DUP with the common
15667 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
15668 // skip the constant lane handling below.
15669 bool PreferDUPAndInsert =
15670 !isConstant && NumDifferentLanes >= 1 &&
15671 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
15672 NumDifferentLanes >= NumConstantLanes;
15673
15674  // If only one constant value was used, and for more than one lane, start by
15675  // splatting that value, then replace the non-constant lanes. This
15676 // is better than the default, which will perform a separate initialization
15677 // for each lane.
15678 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
15679 // Firstly, try to materialize the splat constant.
15680 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
15681 unsigned BitSize = VT.getScalarSizeInBits();
15682 APInt ConstantValueAPInt(1, 0);
15683 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
15684 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
15685 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
15686 !ConstantValueAPInt.isAllOnes()) {
15687 Val = ConstantBuildVector(Val, DAG, Subtarget);
15688 if (!Val)
15689 // Otherwise, materialize the constant and splat it.
15690 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
15691 }
15692
15693 // Now insert the non-constant lanes.
15694 for (unsigned i = 0; i < NumElts; ++i) {
15695 SDValue V = Op.getOperand(i);
15696 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15697 if (!isIntOrFPConstant(V) && !V.isUndef())
15698 // Note that type legalization likely mucked about with the VT of the
15699 // source operand, so we may have to convert it here before inserting.
15700 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
15701 }
15702 return Val;
15703 }
15704
15705 // This will generate a load from the constant pool.
15706 if (isConstant) {
15707 LLVM_DEBUG(
15708 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15709 "expansion\n");
15710 return SDValue();
15711 }
15712
15713 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15714 // v4i32s. This is really a truncate, which we can construct out of (legal)
15715 // concats and truncate nodes.
15717 return M;
15718
15719 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15720 if (NumElts >= 4) {
15721 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15722 return Shuffle;
15723
15724 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15725 return Shuffle;
15726 }
15727
15728 if (PreferDUPAndInsert) {
15729 // First, build a vector splatting the common element.
15731 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
15732 // Next, insert the elements that do not match the common value.
15733 for (unsigned I = 0; I < NumElts; ++I)
15734 if (Op.getOperand(I) != Value)
15735 NewVector =
15736 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
15737 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
15738
15739 return NewVector;
15740 }
15741
15742 // If the vector consists of two different values, try to generate two DUPs and
15743 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15744 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15746 // Check whether each value occupies a consecutive run of exactly half of the
15747 // vector elements. In that case, we can use CONCAT_VECTORS. For example,
15748 //
15749 // canUseVECTOR_CONCAT = true;
15750 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15751 // t24, t24, t24, t24, t24, t24, t24, t24
15752 //
15753 // canUseVECTOR_CONCAT = false;
15754 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15755 // t24, t24, t24, t24, t24, t24, t24, t24
15756 bool canUseVECTOR_CONCAT = true;
15757 for (auto Pair : DifferentValueMap) {
15758 // Check that each distinct value occupies exactly NumElts / 2 lanes.
15759 if (Pair.second != NumElts / 2)
15760 canUseVECTOR_CONCAT = false;
15761 Vals.push_back(Pair.first);
15762 }
15763
15764 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15765 // CONCAT_VECTORs. For example,
15766 //
15767 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15768 // t24, t24, t24, t24, t24, t24, t24, t24
15769 // ==>
15770 // t26: v8i8 = AArch64ISD::DUP t23
15771 // t28: v8i8 = AArch64ISD::DUP t24
15772 // t29: v16i8 = concat_vectors t26, t28
15773 if (canUseVECTOR_CONCAT) {
15774 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15775 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15776 SubVT.getVectorNumElements() >= 2) {
15777 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15778 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15779 SDValue DUP1 =
15780 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
15781 SDValue DUP2 =
15782 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
15784 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
15785 return CONCAT_VECTORS;
15786 }
15787 }
15788
15789 // Let's try to generate VECTOR_SHUFFLE. For example,
15790 //
15791 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15792 // ==>
15793 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15794 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15795 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15796 if (NumElts >= 8) {
15797 SmallVector<int, 16> MaskVec;
15798 // Build the mask for VECTOR_SHUFFLE.
15799 SDValue FirstLaneVal = Op.getOperand(0);
15800 for (unsigned i = 0; i < NumElts; ++i) {
15801 SDValue Val = Op.getOperand(i);
15802 if (FirstLaneVal == Val)
15803 MaskVec.push_back(i);
15804 else
15805 MaskVec.push_back(i + NumElts);
15806 }
15807
15808 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15809 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15810 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
15811 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
15813 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
15814 return VECTOR_SHUFFLE;
15815 }
15816 }
15817
15818 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15819 // know the default expansion would otherwise fall back on something even
15820 // worse. For a vector with one or two non-undef values, that's
15821 // scalar_to_vector for the elements followed by a shuffle (provided the
15822 // shuffle is valid for the target) and materialization element by element
15823 // on the stack followed by a load for everything else.
15824 if (!isConstant && !usesOnlyOneValue) {
15825 LLVM_DEBUG(
15826 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15827 "of INSERT_VECTOR_ELT\n");
15828
15829 SDValue Vec = DAG.getUNDEF(VT);
15830 SDValue Op0 = Op.getOperand(0);
15831 unsigned i = 0;
15832
15833 // Use SCALAR_TO_VECTOR for lane zero to
15834 // a) Avoid a RMW dependency on the full vector register, and
15835 // b) Allow the register coalescer to fold away the copy if the
15836 // value is already in an S or D register, and we're forced to emit an
15837 // INSERT_SUBREG that we can't fold anywhere.
15838 //
15839 // We also allow types like i8 and i16 which are illegal scalar but legal
15840 // vector element types. After type-legalization the inserted value is
15841 // extended (i32) and it is safe to cast them to the vector type by ignoring
15842 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15843 if (!Op0.isUndef()) {
15844 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15845 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
15846 ++i;
15847 }
15848 LLVM_DEBUG({
15849 if (i < NumElts)
15850 dbgs() << "Creating nodes for the other vector elements:\n";
15851 });
15852 for (; i < NumElts; ++i) {
15853 SDValue V = Op.getOperand(i);
15854 if (V.isUndef())
15855 continue;
15856 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15857 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
15858 }
15859 return Vec;
15860 }
15861
15862 LLVM_DEBUG(
15863 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15864 "better alternative\n");
15865 return SDValue();
15866}
15867
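// Lower CONCAT_VECTORS of scalable vectors. Fixed-length inputs that must use
// SVE are diverted to LowerFixedLengthConcatVectorsToSVE. Otherwise, when the
// (power-of-two many) operands are legal, subvectors are concatenated pairwise
// until a single value of the result type remains; a two-operand concat is
// already legal and is returned unchanged.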
15868SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15869 SelectionDAG &DAG) const {
15870 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15871 !Subtarget->isNeonAvailable()))
15872 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15873
15874 assert(Op.getValueType().isScalableVector() &&
15875 isTypeLegal(Op.getValueType()) &&
15876 "Expected legal scalable vector type!");
15877
15878 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15879 unsigned NumOperands = Op->getNumOperands();
15880 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15881 "Unexpected number of operands in CONCAT_VECTORS");
15882
15883 if (NumOperands == 2)
15884 return Op;
15885
15886 // Concat each pair of subvectors and pack into the lower half of the array.
15887 SmallVector<SDValue> ConcatOps(Op->ops());
15888 while (ConcatOps.size() > 1) {
15889 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15890 SDValue V1 = ConcatOps[I];
15891 SDValue V2 = ConcatOps[I + 1];
15892 EVT SubVT = V1.getValueType();
15893 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15894 ConcatOps[I / 2] =
15895 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15896 }
15897 ConcatOps.resize(ConcatOps.size() / 2);
15898 }
15899 return ConcatOps[0];
15900 }
15901
15902 return SDValue();
15903}
15904
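// Lower INSERT_VECTOR_ELT. Predicate (i1) vectors are promoted to an integer
// vector, the insert is performed there, and the result is truncated back.
// Inserts with a constant, in-range lane index are legal as-is; anything else
// is left to the default expansion.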
15905SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15906 SelectionDAG &DAG) const {
15907 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15908
15909 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15910 !Subtarget->isNeonAvailable()))
15911 return LowerFixedLengthInsertVectorElt(Op, DAG);
15912
15913 EVT VT = Op.getOperand(0).getValueType();
15914
15915 if (VT.getScalarType() == MVT::i1) {
15916 EVT VectorVT = getPromotedVTForPredicate(VT);
15917 SDLoc DL(Op);
15918 SDValue ExtendedVector =
15919 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
15920 SDValue ExtendedValue =
15921 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
15922 VectorVT.getScalarType().getSizeInBits() < 32
15923 ? MVT::i32
15924 : VectorVT.getScalarType());
15925 ExtendedVector =
15926 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
15927 ExtendedValue, Op.getOperand(2));
15928 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
15929 }
15930
15931 // Check for non-constant or out of range lane.
15932 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
15933 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15934 return SDValue();
15935
15936 return Op;
15937}
15938
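// Lower EXTRACT_VECTOR_ELT. Predicate (i1) vectors are any-extended before
// extracting, 64-bit NEON vectors are widened to their 128-bit counterpart so
// the V128 patterns apply, and extractions from V128 types with a constant,
// in-range index are already legal.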
15939SDValue
15940AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15941 SelectionDAG &DAG) const {
15942 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
15943 EVT VT = Op.getOperand(0).getValueType();
15944
15945 if (VT.getScalarType() == MVT::i1) {
15946 // We can't directly extract from an SVE predicate; extend it first.
15947 // (This isn't the only possible lowering, but it's straightforward.)
15948 EVT VectorVT = getPromotedVTForPredicate(VT);
15949 SDLoc DL(Op);
15950 SDValue Extend =
15951 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
15952 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
15953 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
15954 Extend, Op.getOperand(1));
15955 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
15956 }
15957
15958 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15959 return LowerFixedLengthExtractVectorElt(Op, DAG);
15960
15961 // Check for non-constant or out of range lane.
15962 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
15963 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15964 return SDValue();
15965
15966 // Insertion/extraction are legal for V128 types.
15967 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
15968 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
15969 VT == MVT::v8f16 || VT == MVT::v8bf16)
15970 return Op;
15971
15972 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
15973 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
15974 VT != MVT::v4bf16)
15975 return SDValue();
15976
15977 // For V64 types, we perform extraction by expanding the value
15978 // to a V128 type and perform the extraction on that.
15979 SDLoc DL(Op);
15980 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
15981 EVT WideTy = WideVec.getValueType();
15982
15983 EVT ExtrTy = WideTy.getVectorElementType();
15984 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
15985 ExtrTy = MVT::i32;
15986
15987 // For extractions, we just return the result directly.
15988 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
15989 Op.getOperand(1));
15990}
15991
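// Lower EXTRACT_SUBVECTOR. Extracting the low or high 64 bits of a 128-bit
// NEON vector maps onto a subregister copy. For scalable inputs (or
// fixed-length inputs that must use SVE), unpacked types are first packed into
// a full SVE register; a non-zero index is handled by splicing the requested
// part to the front of the vector and converting the result to the fixed
// result type.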
15992SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
15993 SelectionDAG &DAG) const {
15994 EVT VT = Op.getValueType();
15996 "Only cases that extract a fixed length vector are supported!");
15997 EVT InVT = Op.getOperand(0).getValueType();
15998
15999 // If we don't have legal types yet, do nothing
16000 if (!isTypeLegal(InVT))
16001 return SDValue();
16002
16003 if (InVT.is128BitVector()) {
16004 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
16005 unsigned Idx = Op.getConstantOperandVal(1);
16006
16007 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
16008 if (Idx == 0)
16009 return Op;
16010
16011 // If this is extracting the upper 64-bits of a 128-bit vector, we match
16012 // that directly.
16013 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
16014 return Op;
16015 }
16016
16017 if (InVT.isScalableVector() ||
16018 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16019 SDLoc DL(Op);
16020 SDValue Vec = Op.getOperand(0);
16021 SDValue Idx = Op.getOperand(1);
16022
16023 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16024 if (PackedVT != InVT) {
16025 // Pack input into the bottom part of an SVE register and try again.
16026 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16027 DAG.getUNDEF(PackedVT), Vec,
16028 DAG.getVectorIdxConstant(0, DL));
16029 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16030 }
16031
16032 // This will get matched by custom code during ISelDAGToDAG.
16033 if (isNullConstant(Idx))
16034 return Op;
16035
16036 assert(InVT.isScalableVector() && "Unexpected vector type!");
16037 // Move requested subvector to the start of the vector and try again.
16038 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
16039 return convertFromScalableVector(DAG, VT, Splice);
16040 }
16041
16042 return SDValue();
16043}
16044
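// Lower INSERT_SUBVECTOR into a scalable vector. Predicate (i1) inserts are
// split into the low and high halves and reassembled with CONCAT_VECTORS.
// Inserting into an undef vector is selected directly. A half-sized data
// subvector is merged by widening the preserved half of the destination
// (UUNPKLO/UUNPKHI) and recombining it with the subvector using UZP1. A
// fixed-length subvector inserted at index 0 of a packed type becomes a
// predicated select.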
16045SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16046 SelectionDAG &DAG) const {
16047 assert(Op.getValueType().isScalableVector() &&
16048 "Only expect to lower inserts into scalable vectors!");
16049
16050 EVT InVT = Op.getOperand(1).getValueType();
16051 unsigned Idx = Op.getConstantOperandVal(2);
16052
16053 SDValue Vec0 = Op.getOperand(0);
16054 SDValue Vec1 = Op.getOperand(1);
16055 SDLoc DL(Op);
16056 EVT VT = Op.getValueType();
16057
16058 if (InVT.isScalableVector()) {
16059 if (!isTypeLegal(VT))
16060 return SDValue();
16061
16062 // Break down insert_subvector into simpler parts.
16063 if (VT.getVectorElementType() == MVT::i1) {
16064 unsigned NumElts = VT.getVectorMinNumElements();
16065 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16066
16067 SDValue Lo, Hi;
16068 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16069 DAG.getVectorIdxConstant(0, DL));
16070 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16071 DAG.getVectorIdxConstant(NumElts / 2, DL));
16072 if (Idx < (NumElts / 2))
16073 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16074 DAG.getVectorIdxConstant(Idx, DL));
16075 else
16076 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16077 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16078
16079 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16080 }
16081
16082 // We can select these directly.
16083 if (isTypeLegal(InVT) && Vec0.isUndef())
16084 return Op;
16085
16086 // Ensure the subvector is half the size of the main vector.
16087 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16088 return SDValue();
16089
16090 // Here narrow and wide refer to the vector element types. After "casting",
16091 // both vectors must have the same bit length, and so, because the subvector
16092 // has fewer elements, those elements need to be bigger.
16093 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16094 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16095
16096 // NOP cast operands to the largest legal vector of the same element count.
16097 if (VT.isFloatingPoint()) {
16098 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16099 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16100 } else {
16101 // Legal integer vectors are already their largest so Vec0 is fine as is.
16102 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16103 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16104 }
16105
16106 // To replace the top/bottom half of vector V with vector SubV we widen the
16107 // preserved half of V, concatenate this to SubV (the order depending on the
16108 // half being replaced) and then narrow the result.
16109 SDValue Narrow;
16110 if (Idx == 0) {
16111 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16112 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16113 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16114 } else {
16115 assert(Idx == InVT.getVectorMinNumElements() &&
16116 "Invalid subvector index!");
16117 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16118 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16119 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16120 }
16121
16122 return getSVESafeBitCast(VT, Narrow, DAG);
16123 }
16124
16125 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16126 // This will be matched by custom code during ISelDAGToDAG.
16127 if (Vec0.isUndef())
16128 return Op;
16129
16130 std::optional<unsigned> PredPattern =
16132 auto PredTy = VT.changeVectorElementType(MVT::i1);
16133 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16134 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16135 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16136 }
16137
16138 return SDValue();
16139}
16140
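// Return true if Op is a vector splat of a power-of-two constant or of the
// negation of one. On success, SplatVal holds the positive power of two and
// Negated records whether the original value was negative.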
16141static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16142 if (Op.getOpcode() != AArch64ISD::DUP &&
16143 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16144 Op.getOpcode() != ISD::BUILD_VECTOR)
16145 return false;
16146
16147 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16148 !isAllConstantBuildVector(Op, SplatVal))
16149 return false;
16150
16151 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16152 !isa<ConstantSDNode>(Op->getOperand(0)))
16153 return false;
16154
16155 SplatVal = Op->getConstantOperandVal(0);
16156 if (Op.getValueType().getVectorElementType() != MVT::i64)
16157 SplatVal = (int32_t)SplatVal;
16158
16159 Negated = false;
16160 if (isPowerOf2_64(SplatVal))
16161 return true;
16162
16163 Negated = true;
16164 if (isPowerOf2_64(-SplatVal)) {
16165 SplatVal = -SplatVal;
16166 return true;
16167 }
16168
16169 return false;
16170}
16171
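// Lower vector SDIV/UDIV for SVE. A signed division by a (possibly negated)
// power-of-two splat becomes a predicated arithmetic shift right (SRAD), with
// the result negated if required. 32- and 64-bit element types map directly
// onto the predicated DIV nodes. i8/i16 element types have no SVE DIV, so the
// operands are unpacked to the next wider type, divided, and the halves are
// narrowed back together with UZP1.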
16172SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16173 EVT VT = Op.getValueType();
16174 SDLoc DL(Op);
16175
16176 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16177 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16178
16179 assert(VT.isScalableVector() && "Expected a scalable vector.");
16180
16181 bool Signed = Op.getOpcode() == ISD::SDIV;
16182 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16183
16184 bool Negated;
16185 uint64_t SplatVal;
16186 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
16188 SDValue Res =
16189 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16190 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16191 if (Negated)
16192 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16193
16194 return Res;
16195 }
16196
16197 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
16198 return LowerToPredicatedOp(Op, DAG, PredOpcode);
16199
16200 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16201 // operations, and truncate the result.
16202 EVT WidenedVT;
16203 if (VT == MVT::nxv16i8)
16204 WidenedVT = MVT::nxv8i16;
16205 else if (VT == MVT::nxv8i16)
16206 WidenedVT = MVT::nxv4i32;
16207 else
16208 llvm_unreachable("Unexpected Custom DIV operation");
16209
16210 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16211 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16212 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16213 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16214 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16215 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16216 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
16217 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
16218 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16219 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16220 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16221}
16222
16223bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16224 EVT VT, unsigned DefinedValues) const {
16225 if (!Subtarget->isNeonAvailable())
16226 return false;
16228}
16229
16231 // Currently no fixed length shuffles that require SVE are legal.
16232 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16233 return false;
16234
16235 if (VT.getVectorNumElements() == 4 &&
16236 (VT.is128BitVector() || VT.is64BitVector())) {
16237 unsigned Cost = getPerfectShuffleCost(M);
16238 if (Cost <= 1)
16239 return true;
16240 }
16241
16242 bool DummyBool;
16243 int DummyInt;
16244 unsigned DummyUnsigned;
16245
16246 unsigned EltSize = VT.getScalarSizeInBits();
16247 unsigned NumElts = VT.getVectorNumElements();
16249 isREVMask(M, EltSize, NumElts, 64) ||
16250 isREVMask(M, EltSize, NumElts, 32) ||
16251 isREVMask(M, EltSize, NumElts, 16) ||
16252 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
16253 isSingletonEXTMask(M, VT, DummyUnsigned) ||
16254 isTRNMask(M, NumElts, DummyUnsigned) ||
16255 isUZPMask(M, NumElts, DummyUnsigned) ||
16256 isZIPMask(M, NumElts, DummyUnsigned) ||
16257 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
16258 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
16259 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
16260 isINSMask(M, NumElts, DummyBool, DummyInt) ||
16261 isConcatMask(M, VT, VT.getSizeInBits() == 128));
16262}
16263
16265 EVT VT) const {
16266 // Just delegate to the generic legality check; clear masks aren't special.
16267 return isShuffleMaskLegal(M, VT);
16268}
16269
16270/// getVShiftImm - Check if this is a valid build_vector for the immediate
16271/// operand of a vector shift operation, where all the elements of the
16272/// build_vector must have the same constant integer value.
16273static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
16274 // Ignore bit_converts.
16275 while (Op.getOpcode() == ISD::BITCAST)
16276 Op = Op.getOperand(0);
16278 APInt SplatBits, SplatUndef;
16279 unsigned SplatBitSize;
16280 bool HasAnyUndefs;
16281 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
16282 HasAnyUndefs, ElementBits) ||
16283 SplatBitSize > ElementBits)
16284 return false;
16285 Cnt = SplatBits.getSExtValue();
16286 return true;
16287}
16288
16289/// isVShiftLImm - Check if this is a valid build_vector for the immediate
16290/// operand of a vector shift left operation. That value must be in the range:
16291/// 0 <= Value < ElementBits for a left shift; or
16292/// 0 <= Value <= ElementBits for a long left shift.
16293static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16294 assert(VT.isVector() && "vector shift count is not a vector type");
16295 int64_t ElementBits = VT.getScalarSizeInBits();
16296 if (!getVShiftImm(Op, ElementBits, Cnt))
16297 return false;
16298 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16299}
16300
16301/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16302/// operand of a vector shift right operation. The value must be in the range:
16303 /// 1 <= Value <= ElementBits for a right shift, or ElementBits/2 for a narrowing right shift.
16304static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16305 assert(VT.isVector() && "vector shift count is not a vector type");
16306 int64_t ElementBits = VT.getScalarSizeInBits();
16307 if (!getVShiftImm(Op, ElementBits, Cnt))
16308 return false;
16309 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16310}
16311
16312SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16313 SelectionDAG &DAG) const {
16314 EVT VT = Op.getValueType();
16315
16316 if (VT.getScalarType() == MVT::i1) {
16317 // Lower i1 truncate to `(x & 1) != 0`.
16318 SDLoc DL(Op);
16319 EVT OpVT = Op.getOperand(0).getValueType();
16320 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16321 SDValue One = DAG.getConstant(1, DL, OpVT);
16322 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16323 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16324 }
16325
16326 if (!VT.isVector() || VT.isScalableVector())
16327 return SDValue();
16328
16329 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16330 !Subtarget->isNeonAvailable()))
16331 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16332
16333 return SDValue();
16334}
16335
16336 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
16337 // possibly a truncated type; it tells how many bits of the value are to be
16338 // used.
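// Concretely, (srl (add X, 1 << (Shift - 1)), Shift) can become a rounding
// shift right by Shift, provided the addition cannot overflow in a way that
// changes the bits kept by ResVT.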
16340 SelectionDAG &DAG,
16341 unsigned &ShiftValue,
16342 SDValue &RShOperand) {
16343 if (Shift->getOpcode() != ISD::SRL)
16344 return false;
16345
16346 EVT VT = Shift.getValueType();
16347 assert(VT.isScalableVT());
16348
16349 auto ShiftOp1 =
16351 if (!ShiftOp1)
16352 return false;
16353
16354 ShiftValue = ShiftOp1->getZExtValue();
16355 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16356 return false;
16357
16358 SDValue Add = Shift->getOperand(0);
16359 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16360 return false;
16361
16363 "ResVT must be truncated or same type as the shift.");
16364 // Check if an overflow can lead to incorrect results.
16365 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16366 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16367 return false;
16368
16369 auto AddOp1 =
16371 if (!AddOp1)
16372 return false;
16373 uint64_t AddValue = AddOp1->getZExtValue();
16374 if (AddValue != 1ULL << (ShiftValue - 1))
16375 return false;
16376
16377 RShOperand = Add->getOperand(0);
16378 return true;
16379}
16380
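// Lower vector shifts. Constant shift amounts use the immediate forms (VSHL,
// VASHR, VLSHR); variable right shifts are implemented with the NEON sshl/ushl
// intrinsics by negating the shift amount; scalable vectors and fixed-length
// vectors lowered via SVE use the predicated shift nodes, and a suitable SRL
// of an add can become a predicated rounding shift (URSHR).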
16381SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16382 SelectionDAG &DAG) const {
16383 EVT VT = Op.getValueType();
16384 SDLoc DL(Op);
16385 int64_t Cnt;
16386
16387 if (!Op.getOperand(1).getValueType().isVector())
16388 return Op;
16389 unsigned EltSize = VT.getScalarSizeInBits();
16390
16391 switch (Op.getOpcode()) {
16392 case ISD::SHL:
16393 if (VT.isScalableVector() ||
16394 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16395 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16396
16397 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16398 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16399 DAG.getConstant(Cnt, DL, MVT::i32));
16400 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16401 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
16402 MVT::i32),
16403 Op.getOperand(0), Op.getOperand(1));
16404 case ISD::SRA:
16405 case ISD::SRL:
16406 if (VT.isScalableVector() &&
16407 (Subtarget->hasSVE2() ||
16408 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16409 SDValue RShOperand;
16410 unsigned ShiftValue;
16411 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16412 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16413 getPredicateForVector(DAG, DL, VT), RShOperand,
16414 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16415 }
16416
16417 if (VT.isScalableVector() ||
16418 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16419 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16420 : AArch64ISD::SRL_PRED;
16421 return LowerToPredicatedOp(Op, DAG, Opc);
16422 }
16423
16424 // Right shift immediate
16425 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16426 unsigned Opc =
16427 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16428 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16429 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
16430 }
16431
16432 // Right shift register. Note, there is not a shift right register
16433 // instruction, but the shift left register instruction takes a signed
16434 // value, where negative numbers specify a right shift.
16435 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16436 : Intrinsic::aarch64_neon_ushl;
16437 // negate the shift amount
16438 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16439 Op.getOperand(1));
16440 SDValue NegShiftLeft =
16442 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16443 NegShift);
16444 return NegShiftLeft;
16445 }
16446
16447 llvm_unreachable("unexpected shift opcode");
16448}
16449
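// Lower vector SETCC. Scalable vectors use the predicated SETCC_MERGE_ZERO
// node and NEON integer comparisons are already legal. Floating-point
// condition codes are mapped onto one or, for some predicates, two AArch64
// vector comparisons whose results are ORed together, with a final NOT when
// the mapping requires inversion. SETO/SETUO against an operand known never
// to be NaN simplify to an ordered/unordered self-comparison.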
16450SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
16451 SelectionDAG &DAG) const {
16452 if (Op.getValueType().isScalableVector())
16453 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
16454
16455 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16456 !Subtarget->isNeonAvailable()))
16457 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
16458
16459 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16460 SDValue LHS = Op.getOperand(0);
16461 SDValue RHS = Op.getOperand(1);
16462 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
16463 SDLoc DL(Op);
16464
16465 if (LHS.getValueType().getVectorElementType().isInteger())
16466 return Op;
16467
16468 assert(((!Subtarget->hasFullFP16() &&
16469 LHS.getValueType().getVectorElementType() != MVT::f16) ||
16470 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
16471 LHS.getValueType().getVectorElementType() != MVT::f128) &&
16472 "Unexpected type!");
16473
16474 // Lower isnan(x) | isnan(never-nan) to x != x.
16475 // Lower !isnan(x) & !isnan(never-nan) to x == x.
16476 if (CC == ISD::SETUO || CC == ISD::SETO) {
16477 bool OneNaN = false;
16478 if (LHS == RHS) {
16479 OneNaN = true;
16480 } else if (DAG.isKnownNeverNaN(RHS)) {
16481 OneNaN = true;
16482 RHS = LHS;
16483 } else if (DAG.isKnownNeverNaN(LHS)) {
16484 OneNaN = true;
16485 LHS = RHS;
16486 }
16487 if (OneNaN) {
16488 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
16489 }
16490 }
16491
16492 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
16493 // clean. Some of them require two branches to implement.
16494 AArch64CC::CondCode CC1, CC2;
16495 bool ShouldInvert;
16496 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
16497
16498 bool NoNaNs =
16499 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
16500 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
16501 if (!Cmp.getNode())
16502 return SDValue();
16503
16504 if (CC2 != AArch64CC::AL) {
16505 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
16506 if (!Cmp2.getNode())
16507 return SDValue();
16508
16509 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
16510 }
16511
16512 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
16513
16514 if (ShouldInvert)
16515 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
16516
16517 return Cmp;
16518}
16519
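// Emit the given across-vector reduction node and extract lane 0 as the scalar
// result.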
16520static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
16521 SelectionDAG &DAG) {
16522 SDValue VecOp = ScalarOp.getOperand(0);
16523 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
16524 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
16525 DAG.getConstant(0, DL, MVT::i64));
16526}
16527
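// Lower VECREDUCE_AND/OR/XOR for NEON. i1 vectors are extended (sign-extended
// for AND/OR, any-extended for XOR) and reduced with VECREDUCE_UMIN, UMAX or
// ADD respectively, splitting vectors of more than 16 elements first. Wider
// element types are halved with the bitwise operation until the vector fits in
// 64 bits, after which the remaining lanes are folded together on the scalar
// side using shifts.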
16528static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
16529 SDLoc DL, SelectionDAG &DAG) {
16530 unsigned ScalarOpcode;
16531 switch (Opcode) {
16532 case ISD::VECREDUCE_AND:
16533 ScalarOpcode = ISD::AND;
16534 break;
16535 case ISD::VECREDUCE_OR:
16536 ScalarOpcode = ISD::OR;
16537 break;
16538 case ISD::VECREDUCE_XOR:
16539 ScalarOpcode = ISD::XOR;
16540 break;
16541 default:
16542 llvm_unreachable("Expected bitwise vector reduction");
16543 return SDValue();
16544 }
16545
16546 EVT VecVT = Vec.getValueType();
16547 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
16548 "Expected power-of-2 length vector");
16549
16550 EVT ElemVT = VecVT.getVectorElementType();
16551
16552 SDValue Result;
16553 unsigned NumElems = VecVT.getVectorNumElements();
16554
16555 // Special case for boolean reductions
16556 if (ElemVT == MVT::i1) {
16557 // Split large vectors into smaller ones
16558 if (NumElems > 16) {
16559 SDValue Lo, Hi;
16560 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16561 EVT HalfVT = Lo.getValueType();
16562 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
16563 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
16564 }
16565
16566 // Results of setcc operations get widened to 128 bits if their input
16567 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
16568 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
16569 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
16570 // size leads to the best codegen, since e.g. setcc results might need to be
16571 // truncated otherwise.
16572 unsigned ExtendedWidth = 64;
16573 if (Vec.getOpcode() == ISD::SETCC &&
16574 Vec.getOperand(0).getValueSizeInBits() >= 128) {
16575 ExtendedWidth = 128;
16576 }
16577 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
16578
16579 // any_ext doesn't work with umin/umax, so only use it for uadd.
16580 unsigned ExtendOp =
16581 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16582 SDValue Extended = DAG.getNode(
16583 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16584 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16585 // in that case we bitcast the sign extended values from v2i64 to v4i32
16586 // before reduction for optimal code generation.
16587 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16588 NumElems == 2 && ExtendedWidth == 128) {
16589 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16590 ExtendedVT = MVT::i32;
16591 }
16592 switch (ScalarOpcode) {
16593 case ISD::AND:
16594 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16595 break;
16596 case ISD::OR:
16597 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16598 break;
16599 case ISD::XOR:
16600 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16601 break;
16602 default:
16603 llvm_unreachable("Unexpected Opcode");
16604 }
16605
16606 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16607 } else {
16608 // Iteratively split the vector in half and combine using the bitwise
16609 // operation until it fits in a 64 bit register.
16610 while (VecVT.getSizeInBits() > 64) {
16611 SDValue Lo, Hi;
16612 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16613 VecVT = Lo.getValueType();
16614 NumElems = VecVT.getVectorNumElements();
16615 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16616 }
16617
16618 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16619
16620 // Do the remaining work on a scalar since it allows the code generator to
16621 // combine the shift and bitwise operation into one instruction and since
16622 // integer instructions can have higher throughput than vector instructions.
16623 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16624
16625 // Iteratively combine the lower and upper halves of the scalar using the
16626 // bitwise operation, halving the relevant region of the scalar in each
16627 // iteration, until the relevant region is just one element of the original
16628 // vector.
16629 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16630 SDValue ShiftAmount =
16631 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16632 SDValue Shifted =
16633 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16634 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16635 }
16636
16637 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16638 }
16639
16640 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16641}
16642
16643SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16644 SelectionDAG &DAG) const {
16645 SDValue Src = Op.getOperand(0);
16646 EVT SrcVT = Src.getValueType();
16647
16648 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
16649 // widening by inserting zeroes.
16650 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
16651 SrcVT == MVT::v2f16) {
16652 SDLoc DL(Op);
16653 return DAG.getNode(ISD::FADD, DL, MVT::f16,
16654 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
16655 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
16656 }
16657
16658 // Try to lower fixed length reductions to SVE.
16659 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16660 Op.getOpcode() == ISD::VECREDUCE_AND ||
16661 Op.getOpcode() == ISD::VECREDUCE_OR ||
16662 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16663 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16664 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16665 SrcVT.getVectorElementType() == MVT::i64);
16666 if (SrcVT.isScalableVector() ||
16668 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16669
16670 if (SrcVT.getVectorElementType() == MVT::i1)
16671 return LowerPredReductionToSVE(Op, DAG);
16672
16673 switch (Op.getOpcode()) {
16674 case ISD::VECREDUCE_ADD:
16675 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16676 case ISD::VECREDUCE_AND:
16677 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16678 case ISD::VECREDUCE_OR:
16679 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16680 case ISD::VECREDUCE_SMAX:
16681 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16682 case ISD::VECREDUCE_SMIN:
16683 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16684 case ISD::VECREDUCE_UMAX:
16685 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16686 case ISD::VECREDUCE_UMIN:
16687 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16688 case ISD::VECREDUCE_XOR:
16689 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16690 case ISD::VECREDUCE_FADD:
16691 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16692 case ISD::VECREDUCE_FMAX:
16693 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16694 case ISD::VECREDUCE_FMIN:
16695 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16696 case ISD::VECREDUCE_FMAXIMUM:
16697 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16698 case ISD::VECREDUCE_FMINIMUM:
16699 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16700 default:
16701 llvm_unreachable("Unhandled fixed length reduction");
16702 }
16703 }
16704
16705 // Lower NEON reductions.
16706 SDLoc DL(Op);
16707 switch (Op.getOpcode()) {
16708 case ISD::VECREDUCE_AND:
16709 case ISD::VECREDUCE_OR:
16710 case ISD::VECREDUCE_XOR:
16711 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16712 Op.getValueType(), DL, DAG);
16713 case ISD::VECREDUCE_ADD:
16714 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
16715 case ISD::VECREDUCE_SMAX:
16716 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
16717 case ISD::VECREDUCE_SMIN:
16718 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
16719 case ISD::VECREDUCE_UMAX:
16720 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
16721 case ISD::VECREDUCE_UMIN:
16722 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
16723 default:
16724 llvm_unreachable("Unhandled reduction");
16725 }
16726}
16727
16728SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16729 SelectionDAG &DAG) const {
16730 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16731 // No point replacing if we don't have the relevant instruction/libcall anyway
16732 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16733 return SDValue();
16734
16735 // LSE has an atomic load-clear instruction, but not a load-and.
16736 SDLoc DL(Op);
16737 MVT VT = Op.getSimpleValueType();
16738 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16739 SDValue RHS = Op.getOperand(2);
16740 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16741 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
16742 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
16743 Op.getOperand(0), Op.getOperand(1), RHS,
16744 AN->getMemOperand());
16745}
16746
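// Lower Windows dynamic stack allocation. Unless stack probing is disabled
// for the function ("no-stack-arg-probe"), the allocation size is divided by
// 16, passed to the chkstk helper in X15, and then applied to SP (with
// optional realignment) once the probe call returns.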
16747SDValue
16748AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16749 SelectionDAG &DAG) const {
16750
16751 SDLoc DL(Op);
16752 // Get the inputs.
16753 SDNode *Node = Op.getNode();
16754 SDValue Chain = Op.getOperand(0);
16755 SDValue Size = Op.getOperand(1);
16756 MaybeAlign Align =
16757 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16758 EVT VT = Node->getValueType(0);
16759
16761 "no-stack-arg-probe")) {
16762 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16763 Chain = SP.getValue(1);
16764 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16765 if (Align)
16766 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16767 DAG.getSignedConstant(-Align->value(), DL, VT));
16768 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16769 SDValue Ops[2] = {SP, Chain};
16770 return DAG.getMergeValues(Ops, DL);
16771 }
16772
16773 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16774
16775 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16776 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16777 PtrVT, 0);
16778
16779 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16780 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16781 if (Subtarget->hasCustomCallingConv())
16782 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16783
16784 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
16785 DAG.getConstant(4, DL, MVT::i64));
16786 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
16787 Chain =
16788 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
16789 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16790 DAG.getRegisterMask(Mask), Chain.getValue(1));
16791 // To match the actual intent better, we should read the output from X15 here
16792 // again (instead of potentially spilling it to the stack), but rereading Size
16793 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16794 // here.
16795
16796 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
16797 DAG.getConstant(4, DL, MVT::i64));
16798
16799 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16800 Chain = SP.getValue(1);
16801 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16802 if (Align)
16803 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16804 DAG.getSignedConstant(-Align->value(), DL, VT));
16805 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16806
16807 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
16808
16809 SDValue Ops[2] = {SP, Chain};
16810 return DAG.getMergeValues(Ops, DL);
16811}
16812
16813SDValue
16814AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16815 SelectionDAG &DAG) const {
16816 // Get the inputs.
16817 SDNode *Node = Op.getNode();
16818 SDValue Chain = Op.getOperand(0);
16819 SDValue Size = Op.getOperand(1);
16820
16821 MaybeAlign Align =
16822 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16823 SDLoc DL(Op);
16824 EVT VT = Node->getValueType(0);
16825
16826 // Construct the new SP value in a GPR.
16827 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16828 Chain = SP.getValue(1);
16829 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16830 if (Align)
16831 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16832 DAG.getSignedConstant(-Align->value(), DL, VT));
16833
16834 // Set the real SP to the new value with a probing loop.
16835 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
16836 SDValue Ops[2] = {SP, Chain};
16837 return DAG.getMergeValues(Ops, DL);
16838}
16839
16840SDValue
16841AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16842 SelectionDAG &DAG) const {
16843 MachineFunction &MF = DAG.getMachineFunction();
16844
16845 if (Subtarget->isTargetWindows())
16846 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16847 else if (hasInlineStackProbe(MF))
16848 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16849 else
16850 return SDValue();
16851}
16852
16853SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16854 unsigned NewOp) const {
16855 if (Subtarget->hasSVE2())
16856 return LowerToPredicatedOp(Op, DAG, NewOp);
16857
16858 // Default to expand.
16859 return SDValue();
16860}
16861
16862SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16863 SelectionDAG &DAG) const {
16864 EVT VT = Op.getValueType();
16865 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16866
16867 SDLoc DL(Op);
16868 APInt MulImm = Op.getConstantOperandAPInt(0);
16869 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16870 VT);
16871}
16872
16873/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16874template <unsigned NumVecs>
16875static bool
16879 // Retrieve EC from first vector argument.
16880 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16882#ifndef NDEBUG
16883 // Check the assumption that all input vectors are the same type.
16884 for (unsigned I = 0; I < NumVecs; ++I)
16885 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16886 "Invalid type.");
16887#endif
16888 // memVT is `NumVecs * VT`.
16890 EC * NumVecs);
16891 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
16892 Info.offset = 0;
16893 Info.align.reset();
16895 return true;
16896}
16897
16898/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16899/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16900/// specified in the intrinsic calls.
16902 const CallInst &I,
16903 MachineFunction &MF,
16904 unsigned Intrinsic) const {
16905 auto &DL = I.getDataLayout();
16906 switch (Intrinsic) {
16907 case Intrinsic::aarch64_sve_st2:
16908 return setInfoSVEStN<2>(*this, DL, Info, I);
16909 case Intrinsic::aarch64_sve_st3:
16910 return setInfoSVEStN<3>(*this, DL, Info, I);
16911 case Intrinsic::aarch64_sve_st4:
16912 return setInfoSVEStN<4>(*this, DL, Info, I);
16913 case Intrinsic::aarch64_neon_ld2:
16914 case Intrinsic::aarch64_neon_ld3:
16915 case Intrinsic::aarch64_neon_ld4:
16916 case Intrinsic::aarch64_neon_ld1x2:
16917 case Intrinsic::aarch64_neon_ld1x3:
16918 case Intrinsic::aarch64_neon_ld1x4: {
16919 Info.opc = ISD::INTRINSIC_W_CHAIN;
16920 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16921 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16922 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16923 Info.offset = 0;
16924 Info.align.reset();
16925 // volatile loads with NEON intrinsics not supported
16926 Info.flags = MachineMemOperand::MOLoad;
16927 return true;
16928 }
16929 case Intrinsic::aarch64_neon_ld2lane:
16930 case Intrinsic::aarch64_neon_ld3lane:
16931 case Intrinsic::aarch64_neon_ld4lane:
16932 case Intrinsic::aarch64_neon_ld2r:
16933 case Intrinsic::aarch64_neon_ld3r:
16934 case Intrinsic::aarch64_neon_ld4r: {
16935 Info.opc = ISD::INTRINSIC_W_CHAIN;
16936 // These intrinsics return a struct whose members all have the same vector type.
16937 Type *RetTy = I.getType();
16938 auto *StructTy = cast<StructType>(RetTy);
16939 unsigned NumElts = StructTy->getNumElements();
16940 Type *VecTy = StructTy->getElementType(0);
16941 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16942 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16943 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16944 Info.offset = 0;
16945 Info.align.reset();
16946 // volatile loads with NEON intrinsics not supported
16947 Info.flags = MachineMemOperand::MOLoad;
16948 return true;
16949 }
16950 case Intrinsic::aarch64_neon_st2:
16951 case Intrinsic::aarch64_neon_st3:
16952 case Intrinsic::aarch64_neon_st4:
16953 case Intrinsic::aarch64_neon_st1x2:
16954 case Intrinsic::aarch64_neon_st1x3:
16955 case Intrinsic::aarch64_neon_st1x4: {
16956 Info.opc = ISD::INTRINSIC_VOID;
16957 unsigned NumElts = 0;
16958 for (const Value *Arg : I.args()) {
16959 Type *ArgTy = Arg->getType();
16960 if (!ArgTy->isVectorTy())
16961 break;
16962 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
16963 }
16964 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16965 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16966 Info.offset = 0;
16967 Info.align.reset();
16968 // volatile stores with NEON intrinsics not supported
16969 Info.flags = MachineMemOperand::MOStore;
16970 return true;
16971 }
16972 case Intrinsic::aarch64_neon_st2lane:
16973 case Intrinsic::aarch64_neon_st3lane:
16974 case Intrinsic::aarch64_neon_st4lane: {
16975 Info.opc = ISD::INTRINSIC_VOID;
16976 unsigned NumElts = 0;
16977 // All the vector operands have the same type.
16978 Type *VecTy = I.getArgOperand(0)->getType();
16979 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
16980
16981 for (const Value *Arg : I.args()) {
16982 Type *ArgTy = Arg->getType();
16983 if (!ArgTy->isVectorTy())
16984 break;
16985 NumElts += 1;
16986 }
16987
16988 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
16989 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
16990 Info.offset = 0;
16991 Info.align.reset();
16992 // volatile stores with NEON intrinsics not supported
16993 Info.flags = MachineMemOperand::MOStore;
16994 return true;
16995 }
16996 case Intrinsic::aarch64_ldaxr:
16997 case Intrinsic::aarch64_ldxr: {
16998 Type *ValTy = I.getParamElementType(0);
16999 Info.opc = ISD::INTRINSIC_W_CHAIN;
17000 Info.memVT = MVT::getVT(ValTy);
17001 Info.ptrVal = I.getArgOperand(0);
17002 Info.offset = 0;
17003 Info.align = DL.getABITypeAlign(ValTy);
17005 return true;
17006 }
17007 case Intrinsic::aarch64_stlxr:
17008 case Intrinsic::aarch64_stxr: {
17009 Type *ValTy = I.getParamElementType(1);
17010 Info.opc = ISD::INTRINSIC_W_CHAIN;
17011 Info.memVT = MVT::getVT(ValTy);
17012 Info.ptrVal = I.getArgOperand(1);
17013 Info.offset = 0;
17014 Info.align = DL.getABITypeAlign(ValTy);
17016 return true;
17017 }
17018 case Intrinsic::aarch64_ldaxp:
17019 case Intrinsic::aarch64_ldxp:
17020 Info.opc = ISD::INTRINSIC_W_CHAIN;
17021 Info.memVT = MVT::i128;
17022 Info.ptrVal = I.getArgOperand(0);
17023 Info.offset = 0;
17024 Info.align = Align(16);
17026 return true;
17027 case Intrinsic::aarch64_stlxp:
17028 case Intrinsic::aarch64_stxp:
17029 Info.opc = ISD::INTRINSIC_W_CHAIN;
17030 Info.memVT = MVT::i128;
17031 Info.ptrVal = I.getArgOperand(2);
17032 Info.offset = 0;
17033 Info.align = Align(16);
17035 return true;
17036 case Intrinsic::aarch64_sve_ldnt1: {
17037 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
17038 Info.opc = ISD::INTRINSIC_W_CHAIN;
17039 Info.memVT = MVT::getVT(I.getType());
17040 Info.ptrVal = I.getArgOperand(1);
17041 Info.offset = 0;
17042 Info.align = DL.getABITypeAlign(ElTy);
17044 return true;
17045 }
17046 case Intrinsic::aarch64_sve_stnt1: {
17047 Type *ElTy =
17048 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
17049 Info.opc = ISD::INTRINSIC_W_CHAIN;
17050 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
17051 Info.ptrVal = I.getArgOperand(2);
17052 Info.offset = 0;
17053 Info.align = DL.getABITypeAlign(ElTy);
17055 return true;
17056 }
17057 case Intrinsic::aarch64_mops_memset_tag: {
17058 Value *Dst = I.getArgOperand(0);
17059 Value *Val = I.getArgOperand(1);
17060 Info.opc = ISD::INTRINSIC_W_CHAIN;
17061 Info.memVT = MVT::getVT(Val->getType());
17062 Info.ptrVal = Dst;
17063 Info.offset = 0;
17064 Info.align = I.getParamAlign(0).valueOrOne();
17065 Info.flags = MachineMemOperand::MOStore;
17066 // The size of the memory being operated on is unknown at this point
17067 Info.size = MemoryLocation::UnknownSize;
17068 return true;
17069 }
17070 default:
17071 break;
17072 }
17073
17074 return false;
17075}
17076
17078 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
17079 std::optional<unsigned> ByteOffset) const {
17080 // TODO: This may be worth removing. Check regression tests for diffs.
17081 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
17082 ByteOffset))
17083 return false;
17084
17085 // If we're reducing the load width in order to avoid having to use an extra
17086 // instruction to do extension then it's probably a good idea.
17087 if (ExtTy != ISD::NON_EXTLOAD)
17088 return true;
17089 // Don't reduce load width if it would prevent us from combining a shift into
17090 // the offset.
17091 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
17092 assert(Mem);
17093 const SDValue &Base = Mem->getBasePtr();
17094 if (Base.getOpcode() == ISD::ADD &&
17095 Base.getOperand(1).getOpcode() == ISD::SHL &&
17096 Base.getOperand(1).hasOneUse() &&
17097 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
17098 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
17099 if (Mem->getMemoryVT().isScalableVector())
17100 return false;
17101 // The shift can be combined if it matches the size of the value being
17102 // loaded (and so reducing the width would make it not match).
17103 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
17104 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
17105 if (ShiftAmount == Log2_32(LoadBytes))
17106 return false;
17107 }
17108 // We have no reason to disallow reducing the load width, so allow it.
17109 return true;
17110}
17111
17112// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
17114 EVT VT = Extend.getValueType();
17115 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
17116 SDValue Extract = Extend.getOperand(0);
17117 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
17118 Extract = Extract.getOperand(0);
17119 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
17120 EVT VecVT = Extract.getOperand(0).getValueType();
17121 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
17122 return false;
17123 }
17124 }
17125 return true;
17126}
17127
17128// Truncations from 64-bit GPR to 32-bit GPR are free.
17130 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17131 return false;
17132 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
17133 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
17134 return NumBits1 > NumBits2;
17135}
17137 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17138 return false;
17139 uint64_t NumBits1 = VT1.getFixedSizeInBits();
17140 uint64_t NumBits2 = VT2.getFixedSizeInBits();
17141 return NumBits1 > NumBits2;
17142}
17143
17144/// Check if it is profitable to hoist an instruction in then/else to if.
17145/// Not profitable if I and its user can form an FMA instruction,
17146/// because we prefer FMSUB/FMADD.
17148 if (I->getOpcode() != Instruction::FMul)
17149 return true;
17150
17151 if (!I->hasOneUse())
17152 return true;
17153
17154 Instruction *User = I->user_back();
17155
17156 if (!(User->getOpcode() == Instruction::FSub ||
17157 User->getOpcode() == Instruction::FAdd))
17158 return true;
17159
17161 const Function *F = I->getFunction();
17162 const DataLayout &DL = F->getDataLayout();
17163 Type *Ty = User->getOperand(0)->getType();
17164
17165 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17167 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17168 I->getFastMathFlags().allowContract()));
17169}
17170
17171// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17172// 64-bit GPR.
17174 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17175 return false;
17176 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17177 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17178 return NumBits1 == 32 && NumBits2 == 64;
17179}
17181 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17182 return false;
17183 unsigned NumBits1 = VT1.getSizeInBits();
17184 unsigned NumBits2 = VT2.getSizeInBits();
17185 return NumBits1 == 32 && NumBits2 == 64;
17186}
17187
17189 EVT VT1 = Val.getValueType();
17190 if (isZExtFree(VT1, VT2)) {
17191 return true;
17192 }
17193
17194 if (Val.getOpcode() != ISD::LOAD)
17195 return false;
17196
17197 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17198 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17199 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17200 VT1.getSizeInBits() <= 32);
17201}
17202
17203bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17204 if (isa<FPExtInst>(Ext))
17205 return false;
17206
17207 // Vector types are not free.
17208 if (Ext->getType()->isVectorTy())
17209 return false;
17210
17211 for (const Use &U : Ext->uses()) {
17212 // The extension is free if we can fold it with a left shift in an
17213 // addressing mode or an arithmetic operation: add, sub, and cmp.
17214
17215 // Is there a shift?
17216 const Instruction *Instr = cast<Instruction>(U.getUser());
17217
17218 // Is this a constant shift?
17219 switch (Instr->getOpcode()) {
17220 case Instruction::Shl:
17221 if (!isa<ConstantInt>(Instr->getOperand(1)))
17222 return false;
17223 break;
17224 case Instruction::GetElementPtr: {
17225 gep_type_iterator GTI = gep_type_begin(Instr);
17226 auto &DL = Ext->getDataLayout();
17227 std::advance(GTI, U.getOperandNo()-1);
17228 Type *IdxTy = GTI.getIndexedType();
17229 // This extension will end up with a shift because of the scaling factor.
17230 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17231 // Get the shift amount based on the scaling factor:
17232 // log2(sizeof(IdxTy)) - log2(8).
17233 if (IdxTy->isScalableTy())
17234 return false;
17235 uint64_t ShiftAmt =
17236 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
17237 3;
17238 // Is the constant foldable in the shift of the addressing mode?
17239 // I.e., shift amount is between 1 and 4 inclusive.
17240 if (ShiftAmt == 0 || ShiftAmt > 4)
17241 return false;
17242 break;
17243 }
17244 case Instruction::Trunc:
17245 // Check if this is a noop.
17246 // trunc(sext ty1 to ty2) to ty1.
17247 if (Instr->getType() == Ext->getOperand(0)->getType())
17248 continue;
17249 [[fallthrough]];
17250 default:
17251 return false;
17252 }
17253
17254 // At this point we can use the bfm family, so this extension is free
17255 // for that use.
17256 }
17257 return true;
17258}
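The GetElementPtr case above boils down to arithmetic on the indexed type's store size. The following standalone C++ sketch (illustrative only, not part of AArch64ISelLowering.cpp) mirrors that check: the shift amount is log2 of the element size in bytes, and only shifts of 1 to 4 can be folded into the extended-register addressing mode.

#include <cstdio>

// Mirrors: shift = countr_zero(storeSizeInBits) - 3, i.e. log2 of the byte
// size for power-of-two types; foldable only when the shift is 1..4.
static bool shiftFoldableInAddressingMode(unsigned StoreSizeInBits) {
  unsigned ShiftAmt = 0;
  for (unsigned Bytes = StoreSizeInBits / 8; Bytes > 1; Bytes /= 2)
    ++ShiftAmt;
  return ShiftAmt >= 1 && ShiftAmt <= 4;
}

int main() {
  printf("%d\n", shiftFoldableInAddressingMode(8));   // i8:   shift 0 -> 0
  printf("%d\n", shiftFoldableInAddressingMode(32));  // i32:  shift 2 -> 1
  printf("%d\n", shiftFoldableInAddressingMode(64));  // i64:  shift 3 -> 1
  printf("%d\n", shiftFoldableInAddressingMode(256)); // 256b: shift 5 -> 0
  return 0;
}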
17259
17260static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
17261 unsigned NumElts, bool IsLittleEndian,
17262 SmallVectorImpl<int> &Mask) {
17263 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
17264 return false;
17265
17266 assert(DstWidth % SrcWidth == 0 &&
17267 "TBL lowering is not supported for a conversion instruction with this "
17268 "source and destination element type.");
17269
17270 unsigned Factor = DstWidth / SrcWidth;
17271 unsigned MaskLen = NumElts * Factor;
17272
17273 Mask.clear();
17274 Mask.resize(MaskLen, NumElts);
17275
17276 unsigned SrcIndex = 0;
17277 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
17278 Mask[I] = SrcIndex++;
17279
17280 return true;
17281}
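To make the mask layout concrete, here is a standalone C++ sketch (not part of this file) that reproduces the index arithmetic above for a little-endian zext from 8 x i8 to 8 x i32: every Factor-th slot takes the next source lane, and every other slot points at the zeroed element appended at index NumElts.

#include <cstdio>
#include <vector>

int main() {
  unsigned SrcWidth = 8, DstWidth = 32, NumElts = 8;
  bool IsLittleEndian = true;
  unsigned Factor = DstWidth / SrcWidth;        // 4
  unsigned MaskLen = NumElts * Factor;          // 32
  std::vector<int> Mask(MaskLen, (int)NumElts); // default: the zero lane
  unsigned SrcIndex = 0;
  for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
    Mask[I] = SrcIndex++;
  for (int M : Mask)
    printf("%d ", M); // 0 8 8 8 1 8 8 8 2 8 8 8 ... 7 8 8 8
  printf("\n");
  return 0;
}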
17282
17283static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
17284 FixedVectorType *ZExtTy,
17285 FixedVectorType *DstTy,
17286 bool IsLittleEndian) {
17287 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17288 unsigned NumElts = SrcTy->getNumElements();
17289 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17290 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17291
17292 SmallVector<int> Mask;
17293 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
17294 return nullptr;
17295
17296 auto *FirstEltZero = Builder.CreateInsertElement(
17297 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17298 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17299 Result = Builder.CreateBitCast(Result, DstTy);
17300 if (DstTy != ZExtTy)
17301 Result = Builder.CreateZExt(Result, ZExtTy);
17302 return Result;
17303}
17304
17305static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
17306 FixedVectorType *DstTy,
17307 bool IsLittleEndian) {
17308 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17309 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17310 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17311
17312 SmallVector<int> Mask;
17313 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
17314 !IsLittleEndian, Mask))
17315 return nullptr;
17316
17317 auto *FirstEltZero = Builder.CreateInsertElement(
17318 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17319
17320 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17321}
17322
17323static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
17324 IRBuilder<> Builder(TI);
17325 SmallVector<Value *> Parts;
17326 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
17327 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
17328 auto *DstTy = cast<FixedVectorType>(TI->getType());
17329 assert(SrcTy->getElementType()->isIntegerTy() &&
17330 "Non-integer type source vector element is not supported");
17331 assert(DstTy->getElementType()->isIntegerTy(8) &&
17332 "Unsupported destination vector element type");
17333 unsigned SrcElemTySz =
17334 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17335 unsigned DstElemTySz =
17336 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17337 assert((SrcElemTySz % DstElemTySz == 0) &&
17338 "Cannot lower truncate to tbl instructions for a source element size "
17339 "that is not divisible by the destination element size");
17340 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
17341 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
17342 "Unsupported source vector element type size");
17343 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
17344
17345 // Create a mask to choose every nth byte from the source vector table of
17346 // bytes to create the truncated destination vector, where 'n' is the truncate
17347 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
17348 // bytes 0, 8, 16, ..., (Y-1)*8 for the little-endian format.
17349 SmallVector<Constant *, 16> MaskConst;
17350 for (int Itr = 0; Itr < 16; Itr++) {
17351 if (Itr < NumElements)
17352 MaskConst.push_back(Builder.getInt8(
17353 IsLittleEndian ? Itr * TruncFactor
17354 : Itr * TruncFactor + (TruncFactor - 1)));
17355 else
17356 MaskConst.push_back(Builder.getInt8(255));
17357 }
17358
17359 int MaxTblSz = 128 * 4;
17360 int MaxSrcSz = SrcElemTySz * NumElements;
17361 int ElemsPerTbl =
17362 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
17363 assert(ElemsPerTbl <= 16 &&
17364 "Maximum elements selected using TBL instruction cannot exceed 16!");
17365
17366 int ShuffleCount = 128 / SrcElemTySz;
17367 SmallVector<int> ShuffleLanes;
17368 for (int i = 0; i < ShuffleCount; ++i)
17369 ShuffleLanes.push_back(i);
17370
17371 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
17372 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
17373 // call TBL & save the result in a vector of TBL results for combining later.
17374 SmallVector<Value *> Results;
17375 while (ShuffleLanes.back() < NumElements) {
17376 Parts.push_back(Builder.CreateBitCast(
17377 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
17378
17379 if (Parts.size() == 4) {
17380 Parts.push_back(ConstantVector::get(MaskConst));
17381 Results.push_back(
17382 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
17383 Parts.clear();
17384 }
17385
17386 for (int i = 0; i < ShuffleCount; ++i)
17387 ShuffleLanes[i] += ShuffleCount;
17388 }
17389
17390 assert((Parts.empty() || Results.empty()) &&
17391 "Lowering trunc for vectors requiring different TBL instructions is "
17392 "not supported!");
17393 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
17394 // registers
17395 if (!Parts.empty()) {
17396 Intrinsic::ID TblID;
17397 switch (Parts.size()) {
17398 case 1:
17399 TblID = Intrinsic::aarch64_neon_tbl1;
17400 break;
17401 case 2:
17402 TblID = Intrinsic::aarch64_neon_tbl2;
17403 break;
17404 case 3:
17405 TblID = Intrinsic::aarch64_neon_tbl3;
17406 break;
17407 }
17408
17409 Parts.push_back(ConstantVector::get(MaskConst));
17410 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
17411 }
17412
17413 // Extract the destination vector from TBL result(s) after combining them
17414 // where applicable. Currently, at most two TBLs are supported.
17415 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
17416 "more than 2 tbl instructions!");
17417 Value *FinalResult = Results[0];
17418 if (Results.size() == 1) {
17419 if (ElemsPerTbl < 16) {
17420 SmallVector<int> FinalMask(ElemsPerTbl);
17421 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17422 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
17423 }
17424 } else {
17425 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
17426 if (ElemsPerTbl < 16) {
17427 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
17428 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
17429 } else {
17430 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17431 }
17432 FinalResult =
17433 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
17434 }
17435
17436 TI->replaceAllUsesWith(FinalResult);
17437 TI->eraseFromParent();
17438}
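For reference, a standalone sketch (assuming a little-endian trunc of 8 x i32 to 8 x i8; not part of the LLVM source) of the byte-selection mask built at the top of createTblForTrunc: every TruncFactor-th byte of the 16-byte table is kept, and the remaining mask slots are 255 so TBL yields zero for them.

#include <cstdio>

int main() {
  int NumElements = 8;          // 8 x i32 -> 8 x i8
  unsigned SrcElemTySz = 32, DstElemTySz = 8;
  unsigned TruncFactor = SrcElemTySz / DstElemTySz; // 4
  bool IsLittleEndian = true;
  unsigned char MaskConst[16];
  for (int Itr = 0; Itr < 16; Itr++)
    MaskConst[Itr] =
        Itr < NumElements
            ? (IsLittleEndian ? Itr * TruncFactor
                              : Itr * TruncFactor + (TruncFactor - 1))
            : 255;
  for (unsigned char B : MaskConst)
    printf("%d ", B); // 0 4 8 12 16 20 24 28 255 255 255 255 255 255 255 255
  printf("\n");
  return 0;
}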
17439
17440bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
17441 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
17442 // shuffle_vector instructions are serialized when targeting SVE,
17443 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
17444 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
17445 return false;
17446
17447 // Try to optimize conversions using tbl. This requires materializing constant
17448 // index vectors, which can increase code size and add loads. Skip the
17449 // transform unless the conversion is in a loop block guaranteed to execute
17450 // and we are not optimizing for size.
17451 Function *F = I->getParent()->getParent();
17452 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
17453 return false;
17454
17455 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
17456 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
17457 if (!SrcTy || !DstTy)
17458 return false;
17459
17460 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
17461 // lowered to tbl instructions to insert the original i8 elements
17462 // into i8x lanes. This is enabled for cases where it is beneficial.
17463 auto *ZExt = dyn_cast<ZExtInst>(I);
17464 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
17465 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
17466 if (DstWidth % 8 != 0)
17467 return false;
17468
17469 auto *TruncDstType =
17470 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
17471 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
17472 // the remaining ZExt folded into the user, don't use tbl lowering.
17473 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
17474 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
17477 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
17478 return false;
17479
17480 DstTy = TruncDstType;
17481 }
17482
17483 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
17484 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
17485 // most one extra extend step is needed and using tbl is not profitable.
17486 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
17487 // udot instruction.
17488 if (SrcWidth * 4 <= DstWidth) {
17489 if (all_of(I->users(), [&](auto *U) {
17490 auto *SingleUser = cast<Instruction>(&*U);
17491 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
17492 return true;
17493 if (match(SingleUser,
17494 m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
17495 m_Value(), m_Specific(I))))
17496 return true;
17497 return false;
17498 }))
17499 return false;
17500 }
17501
17502 if (DstTy->getScalarSizeInBits() >= 64)
17503 return false;
17504
17505 IRBuilder<> Builder(ZExt);
17506 Value *Result = createTblShuffleForZExt(
17507 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
17508 DstTy, Subtarget->isLittleEndian());
17509 if (!Result)
17510 return false;
17511 ZExt->replaceAllUsesWith(Result);
17512 ZExt->eraseFromParent();
17513 return true;
17514 }
17515
17516 auto *UIToFP = dyn_cast<UIToFPInst>(I);
17517 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
17518 DstTy->getElementType()->isFloatTy()) ||
17519 (SrcTy->getElementType()->isIntegerTy(16) &&
17520 DstTy->getElementType()->isDoubleTy()))) {
17521 IRBuilder<> Builder(I);
17522 Value *ZExt = createTblShuffleForZExt(
17523 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
17524 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
17525 assert(ZExt && "Cannot fail for the i8 to float conversion");
17526 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
17527 I->replaceAllUsesWith(UI);
17528 I->eraseFromParent();
17529 return true;
17530 }
17531
17532 auto *SIToFP = dyn_cast<SIToFPInst>(I);
17533 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
17534 DstTy->getElementType()->isFloatTy()) {
17535 IRBuilder<> Builder(I);
17536 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
17537 FixedVectorType::getInteger(DstTy),
17538 Subtarget->isLittleEndian());
17539 assert(Shuffle && "Cannot fail for the i8 to float conversion");
17540 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
17541 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
17542 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
17543 I->replaceAllUsesWith(SI);
17544 I->eraseFromParent();
17545 return true;
17546 }
17547
17548 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
17549 // followed by a truncate lowered to using tbl.4.
17550 auto *FPToUI = dyn_cast<FPToUIInst>(I);
17551 if (FPToUI &&
17552 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
17553 SrcTy->getElementType()->isFloatTy() &&
17554 DstTy->getElementType()->isIntegerTy(8)) {
17555 IRBuilder<> Builder(I);
17556 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
17557 VectorType::getInteger(SrcTy));
17558 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
17559 I->replaceAllUsesWith(TruncI);
17560 I->eraseFromParent();
17561 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
17562 return true;
17563 }
17564
17565 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
17566 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
17567 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
17568 // registers
17569 auto *TI = dyn_cast<TruncInst>(I);
17570 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
17571 ((SrcTy->getElementType()->isIntegerTy(32) ||
17572 SrcTy->getElementType()->isIntegerTy(64)) &&
17573 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
17574 createTblForTrunc(TI, Subtarget->isLittleEndian());
17575 return true;
17576 }
17577
17578 return false;
17579}
17580
17581bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
17582 Align &RequiredAlignment) const {
17583 if (!LoadedType.isSimple() ||
17584 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
17585 return false;
17586 // Cyclone supports unaligned accesses.
17587 RequiredAlignment = Align(1);
17588 unsigned NumBits = LoadedType.getSizeInBits();
17589 return NumBits == 32 || NumBits == 64;
17590}
17591
17592/// A helper function for determining the number of interleaved accesses we
17593/// will generate when lowering accesses of the given type.
17594unsigned AArch64TargetLowering::getNumInterleavedAccesses(
17595 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
17596 unsigned VecSize = 128;
17597 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17598 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17599 if (UseScalable && isa<FixedVectorType>(VecTy))
17600 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17601 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17602}
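Illustration only: the rounding-up division above, evaluated for a few fixed-width vectors under the 128-bit NEON register assumption (a standalone sketch, not the LLVM helper).

#include <algorithm>
#include <cstdio>

// Mirrors: max(1, (MinElts * ElSize + 127) / VecSize) with VecSize = 128.
static unsigned numInterleavedAccesses(unsigned MinElts, unsigned ElSize) {
  unsigned VecSize = 128;
  return std::max(1u, (MinElts * ElSize + 127) / VecSize);
}

int main() {
  printf("%u\n", numInterleavedAccesses(4, 32));  // <4 x i32>  -> 1
  printf("%u\n", numInterleavedAccesses(16, 16)); // <16 x i16> -> 2
  printf("%u\n", numInterleavedAccesses(32, 8));  // <32 x i8>  -> 2
  return 0;
}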
17603
17604MachineMemOperand::Flags
17605AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
17606 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17607 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17608 return MOStridedAccess;
17609 return MachineMemOperand::MONone;
17610}
17611
17612bool AArch64TargetLowering::isLegalInterleavedAccessType(
17613 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17614 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17615 auto EC = VecTy->getElementCount();
17616 unsigned MinElts = EC.getKnownMinValue();
17617
17618 UseScalable = false;
17619
17620 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17621 (!Subtarget->useSVEForFixedLengthVectors() ||
17623 return false;
17624
17625 if (isa<ScalableVectorType>(VecTy) &&
17626 !Subtarget->isSVEorStreamingSVEAvailable())
17627 return false;
17628
17629 // Ensure the number of vector elements is greater than 1.
17630 if (MinElts < 2)
17631 return false;
17632
17633 // Ensure the element type is legal.
17634 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17635 return false;
17636
17637 if (EC.isScalable()) {
17638 UseScalable = true;
17639 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17640 }
17641
17642 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17643 if (Subtarget->useSVEForFixedLengthVectors()) {
17644 unsigned MinSVEVectorSize =
17645 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17646 if (VecSize % MinSVEVectorSize == 0 ||
17647 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17648 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17649 UseScalable = true;
17650 return true;
17651 }
17652 }
17653
17654 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17655 // 128 will be split into multiple interleaved accesses.
17656 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17657}
17658
17659static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17660 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17661 return ScalableVectorType::get(VTy->getElementType(), 2);
17662
17663 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17664 return ScalableVectorType::get(VTy->getElementType(), 4);
17665
17666 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17667 return ScalableVectorType::get(VTy->getElementType(), 8);
17668
17669 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17670 return ScalableVectorType::get(VTy->getElementType(), 8);
17671
17672 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17673 return ScalableVectorType::get(VTy->getElementType(), 2);
17674
17675 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17676 return ScalableVectorType::get(VTy->getElementType(), 4);
17677
17678 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17679 return ScalableVectorType::get(VTy->getElementType(), 8);
17680
17681 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17682 return ScalableVectorType::get(VTy->getElementType(), 16);
17683
17684 llvm_unreachable("Cannot handle input vector type");
17685}
17686
17687static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17688 bool Scalable, Type *LDVTy,
17689 Type *PtrTy) {
17690 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17691 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17692 Intrinsic::aarch64_sve_ld3_sret,
17693 Intrinsic::aarch64_sve_ld4_sret};
17694 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17695 Intrinsic::aarch64_neon_ld3,
17696 Intrinsic::aarch64_neon_ld4};
17697 if (Scalable)
17698 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17699
17700 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17701 {LDVTy, PtrTy});
17702}
17703
17704static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17705 bool Scalable, Type *STVTy,
17706 Type *PtrTy) {
17707 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17708 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17709 Intrinsic::aarch64_sve_st3,
17710 Intrinsic::aarch64_sve_st4};
17711 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17712 Intrinsic::aarch64_neon_st3,
17713 Intrinsic::aarch64_neon_st4};
17714 if (Scalable)
17715 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17716
17717 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17718 {STVTy, PtrTy});
17719}
17720
17721/// Lower an interleaved load into a ldN intrinsic.
17722///
17723/// E.g. Lower an interleaved load (Factor = 2):
17724/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17725/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17726/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17727///
17728/// Into:
17729/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17730/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17731/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17732bool AArch64TargetLowering::lowerInterleavedLoad(
17733 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
17734 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
17735 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17736 "Invalid interleave factor");
17737 assert(!Shuffles.empty() && "Empty shufflevector input");
17738 assert(Shuffles.size() == Indices.size() &&
17739 "Unmatched number of shufflevectors and indices");
17740
17741 auto *LI = dyn_cast<LoadInst>(Load);
17742 if (!LI)
17743 return false;
17744 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
17745
17746 const DataLayout &DL = LI->getDataLayout();
17747
17748 VectorType *VTy = Shuffles[0]->getType();
17749
17750 // Skip if we do not have NEON and skip illegal vector types. We can
17751 // "legalize" wide vector types into multiple interleaved accesses as long as
17752 // the vector types are divisible by 128.
17753 bool UseScalable;
17754 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17755 return false;
17756
17757 // Check if the interleave is a zext(shuffle), that can be better optimized
17758 // into shift / and masks. For the moment we do this just for uitofp (not
17759 // zext) to avoid issues with widening instructions.
17760 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17761 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17762 SI->getType()->getScalarSizeInBits() * 4 ==
17763 SI->user_back()->getType()->getScalarSizeInBits();
17764 }))
17765 return false;
17766
17767 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17768
17769 auto *FVTy = cast<FixedVectorType>(VTy);
17770
17771 // A pointer vector can not be the return type of the ldN intrinsics. Need to
17772 // load integer vectors first and then convert to pointer vectors.
17773 Type *EltTy = FVTy->getElementType();
17774 if (EltTy->isPointerTy())
17775 FVTy =
17776 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17777
17778 // If we're going to generate more than one load, reset the sub-vector type
17779 // to something legal.
17780 FVTy = FixedVectorType::get(FVTy->getElementType(),
17781 FVTy->getNumElements() / NumLoads);
17782
17783 auto *LDVTy =
17784 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17785
17786 IRBuilder<> Builder(LI);
17787
17788 // The base address of the load.
17789 Value *BaseAddr = LI->getPointerOperand();
17790
17791 Type *PtrTy = LI->getPointerOperandType();
17792 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17793 LDVTy->getElementCount());
17794
17795 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17796 UseScalable, LDVTy, PtrTy);
17797
17798 // Holds sub-vectors extracted from the load intrinsic return values. The
17799 // sub-vectors are associated with the shufflevector instructions they will
17800 // replace.
17801 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17802
17803 Value *PTrue = nullptr;
17804 if (UseScalable) {
17805 std::optional<unsigned> PgPattern =
17806 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17807 if (Subtarget->getMinSVEVectorSizeInBits() ==
17808 Subtarget->getMaxSVEVectorSizeInBits() &&
17809 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17810 PgPattern = AArch64SVEPredPattern::all;
17811
17812 auto *PTruePat =
17813 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17814 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17815 {PTruePat});
17816 }
17817
17818 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17819
17820 // If we're generating more than one load, compute the base address of
17821 // subsequent loads as an offset from the previous.
17822 if (LoadCount > 0)
17823 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17824 FVTy->getNumElements() * Factor);
17825
17826 CallInst *LdN;
17827 if (UseScalable)
17828 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17829 else
17830 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17831
17832 // Extract and store the sub-vectors returned by the load intrinsic.
17833 for (unsigned i = 0; i < Shuffles.size(); i++) {
17834 ShuffleVectorInst *SVI = Shuffles[i];
17835 unsigned Index = Indices[i];
17836
17837 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17838
17839 if (UseScalable)
17840 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
17841
17842 // Convert the integer vector to pointer vector if the element is pointer.
17843 if (EltTy->isPointerTy())
17844 SubVec = Builder.CreateIntToPtr(
17845 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
17846 FVTy->getNumElements()));
17847
17848 SubVecs[SVI].push_back(SubVec);
17849 }
17850 }
17851
17852 // Replace uses of the shufflevector instructions with the sub-vectors
17853 // returned by the load intrinsic. If a shufflevector instruction is
17854 // associated with more than one sub-vector, those sub-vectors will be
17855 // concatenated into a single wide vector.
17856 for (ShuffleVectorInst *SVI : Shuffles) {
17857 auto &SubVec = SubVecs[SVI];
17858 auto *WideVec =
17859 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17860 SVI->replaceAllUsesWith(WideVec);
17861 }
17862
17863 return true;
17864}
17865
17866template <typename Iter>
17867bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17868 int MaxLookupDist = 20;
17869 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17870 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17871 const Value *PtrA1 =
17872 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17873
17874 while (++It != End) {
17875 if (It->isDebugOrPseudoInst())
17876 continue;
17877 if (MaxLookupDist-- == 0)
17878 break;
17879 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17880 const Value *PtrB1 =
17881 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17882 DL, OffsetB);
17883 if (PtrA1 == PtrB1 &&
17884 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
17885 .abs() == 16)
17886 return true;
17887 }
17888 }
17889
17890 return false;
17891}
17892
17893/// Lower an interleaved store into a stN intrinsic.
17894///
17895/// E.g. Lower an interleaved store (Factor = 3):
17896/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17897/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17898/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17899///
17900/// Into:
17901/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17902/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17903/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17904/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17905///
17906/// Note that the new shufflevectors will be removed and we'll only generate one
17907/// st3 instruction in CodeGen.
17908///
17909/// Example for a more general valid mask (Factor 3). Lower:
17910/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17911/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17912/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17913///
17914/// Into:
17915/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17916/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17917/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17918/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17919bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
17920 Value *LaneMask,
17921 ShuffleVectorInst *SVI,
17922 unsigned Factor,
17923 const APInt &GapMask) const {
17924
17925 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17926 "Invalid interleave factor");
17927 auto *SI = dyn_cast<StoreInst>(Store);
17928 if (!SI)
17929 return false;
17930 assert(!LaneMask && GapMask.popcount() == Factor &&
17931 "Unexpected mask on store");
17932
17933 auto *VecTy = cast<FixedVectorType>(SVI->getType());
17934 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
17935
17936 unsigned LaneLen = VecTy->getNumElements() / Factor;
17937 Type *EltTy = VecTy->getElementType();
17938 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
17939
17940 const DataLayout &DL = SI->getDataLayout();
17941 bool UseScalable;
17942
17943 // Skip if we do not have NEON and skip illegal vector types. We can
17944 // "legalize" wide vector types into multiple interleaved accesses as long as
17945 // the vector types are divisible by 128.
17946 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
17947 return false;
17948
17949 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
17950
17951 Value *Op0 = SVI->getOperand(0);
17952 Value *Op1 = SVI->getOperand(1);
17953 IRBuilder<> Builder(SI);
17954
17955 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
17956 // vectors to integer vectors.
17957 if (EltTy->isPointerTy()) {
17958 Type *IntTy = DL.getIntPtrType(EltTy);
17959 unsigned NumOpElts =
17960 cast<FixedVectorType>(Op0->getType())->getNumElements();
17961
17962 // Convert to the corresponding integer vector.
17963 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
17964 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
17965 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
17966
17967 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
17968 }
17969
17970 // If we're going to generate more than one store, reset the lane length
17971 // and sub-vector type to something legal.
17972 LaneLen /= NumStores;
17973 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
17974
17975 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
17976 : SubVecTy;
17977
17978 // The base address of the store.
17979 Value *BaseAddr = SI->getPointerOperand();
17980
17981 auto Mask = SVI->getShuffleMask();
17982
17983 // Sanity check: bail out if none of the indices are in range.
17984 // If mask is `poison`, `Mask` may be a vector of -1s.
17985 // If all of them are `poison`, OOB read will happen later.
17986 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
17987 return false;
17988 }
17989 // A 64bit st2 which does not start at element 0 will involve adding extra
17990 // ext elements making the st2 unprofitable, and if there is a nearby store
17991 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
17992 // zip;ldp pair which has higher throughput.
17993 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
17994 (Mask[0] != 0 ||
17995 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
17996 DL) ||
17997 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
17998 BaseAddr, DL)))
17999 return false;
18000
18001 Type *PtrTy = SI->getPointerOperandType();
18002 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
18003 STVTy->getElementCount());
18004
18005 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18006 UseScalable, STVTy, PtrTy);
18007
18008 Value *PTrue = nullptr;
18009 if (UseScalable) {
18010 std::optional<unsigned> PgPattern =
18011 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
18012 if (Subtarget->getMinSVEVectorSizeInBits() ==
18013 Subtarget->getMaxSVEVectorSizeInBits() &&
18014 Subtarget->getMinSVEVectorSizeInBits() ==
18015 DL.getTypeSizeInBits(SubVecTy))
18016 PgPattern = AArch64SVEPredPattern::all;
18017
18018 auto *PTruePat =
18019 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18020 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18021 {PTruePat});
18022 }
18023
18024 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18025
18026 SmallVector<Value *, 5> Ops;
18027
18028 // Split the shufflevector operands into sub vectors for the new stN call.
18029 for (unsigned i = 0; i < Factor; i++) {
18030 Value *Shuffle;
18031 unsigned IdxI = StoreCount * LaneLen * Factor + i;
18032 if (Mask[IdxI] >= 0) {
18033 Shuffle = Builder.CreateShuffleVector(
18034 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18035 } else {
18036 unsigned StartMask = 0;
18037 for (unsigned j = 1; j < LaneLen; j++) {
18038 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18039 if (Mask[IdxJ] >= 0) {
18040 StartMask = Mask[IdxJ] - j;
18041 break;
18042 }
18043 }
18044 // Note: Filling undef gaps with random elements is ok, since
18045 // those elements were being written anyway (with undefs).
18046 // In the case of all undefs we're defaulting to using elems from 0
18047 // Note: StartMask cannot be negative, it's checked in
18048 // isReInterleaveMask
18049 Shuffle = Builder.CreateShuffleVector(
18050 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18051 }
18052
18053 if (UseScalable)
18054 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18055 Shuffle, uint64_t(0));
18056
18057 Ops.push_back(Shuffle);
18058 }
18059
18060 if (UseScalable)
18061 Ops.push_back(PTrue);
18062
18063 // If we're generating more than one store, we compute the base address of
18064 // subsequent stores as an offset from the previous.
18065 if (StoreCount > 0)
18066 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18067 BaseAddr, LaneLen * Factor);
18068
18069 Ops.push_back(BaseAddr);
18070 Builder.CreateCall(StNFunc, Ops);
18071 }
18072 return true;
18073}
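A standalone sketch of how the sequential sub-masks used in the loop above split the Factor = 3 interleave mask from the function's comment (LaneLen = 4, fully defined mask): sub-vector i is the run of LaneLen consecutive lanes starting at Mask[i]. This is illustrative arithmetic, not the LLVM code itself.

#include <cstdio>

int main() {
  // Interleave mask from the Factor = 3 example in the comment above.
  int Mask[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
  unsigned Factor = 3, LaneLen = 4;
  for (unsigned i = 0; i < Factor; i++) {
    // createSequentialMask(Mask[i], LaneLen, 0): LaneLen consecutive lanes.
    printf("sub.v%u: ", i);
    for (unsigned j = 0; j < LaneLen; j++)
      printf("%d ", Mask[i] + (int)j);
    printf("\n"); // sub.v0: 0 1 2 3, sub.v1: 4 5 6 7, sub.v2: 8 9 10 11
  }
  return 0;
}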
18074
18075bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
18076 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
18077 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18078 if (Factor != 2 && Factor != 4) {
18079 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
18080 return false;
18081 }
18082 auto *LI = dyn_cast<LoadInst>(Load);
18083 if (!LI)
18084 return false;
18085 assert(!Mask && "Unexpected mask on a load\n");
18086
18087 VectorType *VTy = getDeinterleavedVectorType(DI);
18088
18089 const DataLayout &DL = LI->getModule()->getDataLayout();
18090 bool UseScalable;
18091 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18092 return false;
18093
18094 // TODO: Add support for using SVE instructions with fixed types later, using
18095 // the code from lowerInterleavedLoad to obtain the correct container type.
18096 if (UseScalable && !VTy->isScalableTy())
18097 return false;
18098
18099 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18100 VectorType *LdTy =
18101 VectorType::get(VTy->getElementType(),
18102 VTy->getElementCount().divideCoefficientBy(NumLoads));
18103
18104 Type *PtrTy = LI->getPointerOperandType();
18105 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18106 UseScalable, LdTy, PtrTy);
18107
18108 IRBuilder<> Builder(LI);
18109 Value *Pred = nullptr;
18110 if (UseScalable)
18111 Pred =
18112 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18113
18114 Value *BaseAddr = LI->getPointerOperand();
18115 Value *Result = nullptr;
18116 if (NumLoads > 1) {
18117 // Create multiple legal small ldN.
18118 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18119 for (unsigned I = 0; I < NumLoads; ++I) {
18120 Value *Offset = Builder.getInt64(I * Factor);
18121
18122 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18123 Value *LdN = nullptr;
18124 if (UseScalable)
18125 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18126 else
18127 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
18128 Value *Idx =
18129 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
18130 for (unsigned J = 0; J < Factor; ++J) {
18131 ExtractedLdValues[J] = Builder.CreateInsertVector(
18132 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18133 }
18134 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18135 }
18136
18137 // Merge the values from different factors.
18138 Result = PoisonValue::get(DI->getType());
18139 for (unsigned J = 0; J < Factor; ++J)
18140 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18141 } else {
18142 if (UseScalable)
18143 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18144 else
18145 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18146 }
18147
18148 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
18149 DI->replaceAllUsesWith(Result);
18150 return true;
18151}
18152
18153bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
18154 Instruction *Store, Value *Mask,
18155 ArrayRef<Value *> InterleavedValues) const {
18156 unsigned Factor = InterleavedValues.size();
18157 if (Factor != 2 && Factor != 4) {
18158 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
18159 return false;
18160 }
18161 auto *SI = dyn_cast<StoreInst>(Store);
18162 if (!SI)
18163 return false;
18164 assert(!Mask && "Unexpected mask on plain store");
18165
18166 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18167 const DataLayout &DL = SI->getModule()->getDataLayout();
18168
18169 bool UseScalable;
18170 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18171 return false;
18172
18173 // TODO: Add support for using SVE instructions with fixed types later, using
18174 // the code from lowerInterleavedStore to obtain the correct container type.
18175 if (UseScalable && !VTy->isScalableTy())
18176 return false;
18177
18178 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18179
18180 VectorType *StTy =
18181 VectorType::get(VTy->getElementType(),
18182 VTy->getElementCount().divideCoefficientBy(NumStores));
18183
18184 Type *PtrTy = SI->getPointerOperandType();
18185 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18186 UseScalable, StTy, PtrTy);
18187
18188 IRBuilder<> Builder(SI);
18189
18190 Value *BaseAddr = SI->getPointerOperand();
18191 Value *Pred = nullptr;
18192
18193 if (UseScalable)
18194 Pred =
18195 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18196
18197 auto ExtractedValues = InterleavedValues;
18198 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18199 if (UseScalable)
18200 StoreOperands.push_back(Pred);
18201 StoreOperands.push_back(BaseAddr);
18202 for (unsigned I = 0; I < NumStores; ++I) {
18203 Value *Address = BaseAddr;
18204 if (NumStores > 1) {
18205 Value *Offset = Builder.getInt64(I * Factor);
18206 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18207 Value *Idx =
18208 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18209 for (unsigned J = 0; J < Factor; J++) {
18210 StoreOperands[J] =
18211 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18212 }
18213 // update the address
18214 StoreOperands[StoreOperands.size() - 1] = Address;
18215 }
18216 Builder.CreateCall(StNFunc, StoreOperands);
18217 }
18218 return true;
18219}
18220
18221EVT AArch64TargetLowering::getOptimalMemOpType(
18222 LLVMContext &Context, const MemOp &Op,
18223 const AttributeList &FuncAttributes) const {
18224 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18225 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18226 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18227 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
18228 // taken one instruction to materialize the v2i64 zero and one store (with
18229 // restrictive addressing mode). Just do i64 stores.
18230 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18231 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18232 if (Op.isAligned(AlignCheck))
18233 return true;
18234 unsigned Fast;
18235 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18236 MachineMemOperand::MONone, &Fast) &&
18237 Fast;
18238 };
18239
18240 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18241 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18242 return MVT::v16i8;
18243 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18244 return MVT::f128;
18245 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18246 return MVT::i64;
18247 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18248 return MVT::i32;
18249 return MVT::Other;
18250}
18251
18252LLT AArch64TargetLowering::getOptimalMemOpLLT(
18253 const MemOp &Op, const AttributeList &FuncAttributes) const {
18254 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18255 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18256 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18257 // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
18258 // taken one instruction to materialize the v2i64 zero and one store (with
18259 // restrictive addressing mode). Just do i64 stores.
18260 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18261 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18262 if (Op.isAligned(AlignCheck))
18263 return true;
18264 unsigned Fast;
18265 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18266 MachineMemOperand::MONone, &Fast) &&
18267 Fast;
18268 };
18269
18270 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18271 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18272 return LLT::fixed_vector(2, 64);
18273 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18274 return LLT::scalar(128);
18275 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18276 return LLT::scalar(64);
18277 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18278 return LLT::scalar(32);
18279 return LLT();
18280}
18281
18282// 12-bit optionally shifted immediates are legal for adds.
18283bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
18284 if (Immed == std::numeric_limits<int64_t>::min()) {
18285 return false;
18286 }
18287 // Same encoding for add/sub, just flip the sign.
18288 return isLegalArithImmed((uint64_t)std::abs(Immed));
18289}
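For reference, a standalone restatement of the rule this relies on (a sketch under the assumption that isLegalArithImmed accepts a 12-bit unsigned value optionally shifted left by 12, which is the AArch64 ADD/SUB immediate encoding); it is not the LLVM helper itself.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Simplified check: uimm12, optionally shifted left by 12.
static bool isLegalArithImmedSketch(uint64_t C) {
  return (C >> 12 == 0) || ((C & 0xfffULL) == 0 && C >> 24 == 0);
}

static bool isLegalAddImmediateSketch(int64_t Immed) {
  if (Immed == INT64_MIN)
    return false; // std::abs would overflow, as guarded above.
  return isLegalArithImmedSketch((uint64_t)std::abs(Immed));
}

int main() {
  printf("%d\n", isLegalAddImmediateSketch(4095));  // 1
  printf("%d\n", isLegalAddImmediateSketch(4096));  // 1 (0x1000 = 1 << 12)
  printf("%d\n", isLegalAddImmediateSketch(4097));  // 0
  printf("%d\n", isLegalAddImmediateSketch(-4095)); // 1 (same encoding as sub)
  return 0;
}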
18290
18291bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
18292 // We will only emit addvl/inc* instructions for SVE2
18293 if (!Subtarget->hasSVE2())
18294 return false;
18295
18296 // addvl's immediates are in terms of the number of bytes in a register.
18297 // Since there are 16 in the base supported size (128bits), we need to
18298 // divide the immediate by that much to give us a useful immediate to
18299 // multiply by vscale. We can't have a remainder as a result of this.
18300 if (Imm % 16 == 0)
18301 return isInt<6>(Imm / 16);
18302
18303 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18304 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18305 // of addvl as a result, so only take h|w|d into account.
18306 // Dec[h|w|d] will cover subtractions.
18307 // Immediates are in the range [1,16], so we can't do a 2's complement check.
18308 // FIXME: Can we make use of other patterns to cover other immediates?
18309
18310 // inch|dech
18311 if (Imm % 8 == 0)
18312 return std::abs(Imm / 8) <= 16;
18313 // incw|decw
18314 if (Imm % 4 == 0)
18315 return std::abs(Imm / 4) <= 16;
18316 // incd|decd
18317 if (Imm % 2 == 0)
18318 return std::abs(Imm / 2) <= 16;
18319
18320 return false;
18321}
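An illustrative standalone sketch of the divisibility rules above, ignoring the SVE2 availability check: multiples of 16 bytes per VL map onto addvl's signed 6-bit multiplier, and multiples of 8/4/2 map onto inch/incw/incd with a multiplier of at most 16.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

static bool isLegalAddScalableImmediateSketch(int64_t Imm) {
  if (Imm % 16 == 0)
    return Imm / 16 >= -32 && Imm / 16 <= 31; // addvl: signed 6-bit multiplier
  if (Imm % 8 == 0)
    return std::abs(Imm / 8) <= 16;           // inch/dech
  if (Imm % 4 == 0)
    return std::abs(Imm / 4) <= 16;           // incw/decw
  if (Imm % 2 == 0)
    return std::abs(Imm / 2) <= 16;           // incd/decd
  return false;
}

int main() {
  printf("%d\n", isLegalAddScalableImmediateSketch(32));  // 1: addvl #2
  printf("%d\n", isLegalAddScalableImmediateSketch(12));  // 1: incw, mul #3
  printf("%d\n", isLegalAddScalableImmediateSketch(200)); // 0: 25 halfwords > 16
  return 0;
}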
18322
18323// Return false to prevent folding
18324// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18325// if the folding leads to worse code.
18326bool AArch64TargetLowering::isMulAddWithConstProfitable(
18327 SDValue AddNode, SDValue ConstNode) const {
18328 // Let the DAGCombiner decide for vector types and large types.
18329 const EVT VT = AddNode.getValueType();
18330 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18331 return true;
18332
18333 // It is worse if c1 is legal add immediate, while c1*c2 is not
18334 // and has to be composed by at least two instructions.
18335 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18336 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18337 const int64_t C1 = C1Node->getSExtValue();
18338 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
18339 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
18340 return true;
18341 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
18342 // Adapt to the width of a register.
18343 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18344 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
18345 if (Insn.size() > 1)
18346 return false;
18347
18348 // Default to true and let the DAGCombiner decide.
18349 return true;
18350}
18351
18352// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
18353// immediates is the same as for an add or a sub.
18354bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
18355 return isLegalAddImmediate(Immed);
18356}
18357
18358/// isLegalAddressingMode - Return true if the addressing mode represented
18359/// by AM is legal for this target, for a load/store of the specified type.
18360bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
18361 const AddrMode &AMode, Type *Ty,
18362 unsigned AS, Instruction *I) const {
18363 // AArch64 has five basic addressing modes:
18364 // reg
18365 // reg + 9-bit signed offset
18366 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
18367 // reg1 + reg2
18368 // reg + SIZE_IN_BYTES * reg
18369
18370 // No global is ever allowed as a base.
18371 if (AMode.BaseGV)
18372 return false;
18373
18374 // No reg+reg+imm addressing.
18375 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
18376 return false;
18377
18378 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
18379 // `2*ScaledReg` into `BaseReg + ScaledReg`
18380 AddrMode AM = AMode;
18381 if (AM.Scale && !AM.HasBaseReg) {
18382 if (AM.Scale == 1) {
18383 AM.HasBaseReg = true;
18384 AM.Scale = 0;
18385 } else if (AM.Scale == 2) {
18386 AM.HasBaseReg = true;
18387 AM.Scale = 1;
18388 } else {
18389 return false;
18390 }
18391 }
18392
18393 // A base register is required in all addressing modes.
18394 if (!AM.HasBaseReg)
18395 return false;
18396
18397 if (Ty->isScalableTy()) {
18398 if (isa<ScalableVectorType>(Ty)) {
18399 // See if we have a foldable vscale-based offset, for vector types which
18400 // are either legal or smaller than the minimum; more work will be
18401 // required if we need to consider addressing for types which need
18402 // legalization by splitting.
18403 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
18404 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
18405 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
18406 isPowerOf2_64(VecNumBytes))
18407 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
18408
18409 uint64_t VecElemNumBytes =
18410 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
18411 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
18412 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
18413 }
18414
18415 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
18416 }
18417
18418 // No scalable offsets allowed for non-scalable types.
18419 if (AM.ScalableOffset)
18420 return false;
18421
18422 // check reg + imm case:
18423 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
18424 uint64_t NumBytes = 0;
18425 if (Ty->isSized()) {
18426 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18427 NumBytes = NumBits / 8;
18428 if (!isPowerOf2_64(NumBits))
18429 NumBytes = 0;
18430 }
18431
18432 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18433 AM.Scale);
18434}
18435
18436// Check whether the two offsets belong to the same imm24 range and share the
18437// same high 12 bits; if so, the high part can be folded into the add's offset.
18438int64_t
18439AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
18440 int64_t MaxOffset) const {
18441 int64_t HighPart = MinOffset & ~0xfffULL;
18442 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18443 // Rebase the value to an integer multiple of imm12.
18444 return HighPart;
18445 }
18446
18447 return 0;
18448}
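A small arithmetic sketch of the rebasing above (standalone, and assuming the isLegalAddImmediate check passes): two offsets in the same imm12 "page" share the high part, which can then serve as the common rebased base.

#include <cstdint>
#include <cstdio>

static int64_t preferredLargeGEPBaseOffsetSketch(int64_t MinOffset,
                                                 int64_t MaxOffset) {
  int64_t HighPart = MinOffset & ~0xfffLL;
  // Assume the legality check on HighPart succeeds for this illustration.
  if (MinOffset >> 12 == MaxOffset >> 12)
    return HighPart;
  return 0;
}

int main() {
  // 0x1234 and 0x1ff8 share high part 0x1000, so accesses can be rebased to it.
  printf("0x%llx\n",
         (unsigned long long)preferredLargeGEPBaseOffsetSketch(0x1234, 0x1ff8));
  // 0xff0 and 0x1010 fall in different imm12 pages, so no common base is chosen.
  printf("0x%llx\n",
         (unsigned long long)preferredLargeGEPBaseOffsetSketch(0xff0, 0x1010));
  return 0;
}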
18449
18450bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
18451 // Consider splitting large offset of struct or array.
18452 return true;
18453}
18454
18455bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
18456 const MachineFunction &MF, EVT VT) const {
18457 EVT ScalarVT = VT.getScalarType();
18458
18459 if (!ScalarVT.isSimple())
18460 return false;
18461
18462 switch (ScalarVT.getSimpleVT().SimpleTy) {
18463 case MVT::f16:
18464 return Subtarget->hasFullFP16();
18465 case MVT::f32:
18466 case MVT::f64:
18467 return true;
18468 case MVT::bf16:
18469 return VT.isScalableVector() && Subtarget->hasSVEB16B16() &&
18470 Subtarget->isNonStreamingSVEorSME2Available();
18471 default:
18472 break;
18473 }
18474
18475 return false;
18476}
18477
18478bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
18479 Type *Ty) const {
18480 switch (Ty->getScalarType()->getTypeID()) {
18481 case Type::FloatTyID:
18482 case Type::DoubleTyID:
18483 return true;
18484 default:
18485 return false;
18486 }
18487}
18488
18489bool AArch64TargetLowering::generateFMAsInMachineCombiner(
18490 EVT VT, CodeGenOptLevel OptLevel) const {
18491 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
18492 !useSVEForFixedLengthVectorVT(VT);
18493}
18494
18495const MCPhysReg *
18496AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
18497 // LR is a callee-save register, but we must treat it as clobbered by any call
18498 // site. Hence we include LR in the scratch registers, which are in turn added
18499 // as implicit-defs for stackmaps and patchpoints.
18500 static const MCPhysReg ScratchRegs[] = {
18501 AArch64::X16, AArch64::X17, AArch64::LR, 0
18502 };
18503 return ScratchRegs;
18504}
18505
18506ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
18507 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
18508 return RCRegs;
18509}
18510
18511bool
18512AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
18513 CombineLevel Level) const {
18514 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
18515 N->getOpcode() == ISD::SRL) &&
18516 "Expected shift op");
18517
18518 SDValue ShiftLHS = N->getOperand(0);
18519 EVT VT = N->getValueType(0);
18520
18521 if (!ShiftLHS->hasOneUse())
18522 return false;
18523
18524 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
18525 !ShiftLHS.getOperand(0)->hasOneUse())
18526 return false;
18527
18528 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
18529 // combine it with shift 'N' to let it be lowered to UBFX except:
18530 // ((x >> C) & mask) << C.
18531 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
18532 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
18533 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
18534 if (isMask_64(TruncMask)) {
18535 SDValue AndLHS = ShiftLHS.getOperand(0);
18536 if (AndLHS.getOpcode() == ISD::SRL) {
18537 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
18538 if (N->getOpcode() == ISD::SHL)
18539 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
18540 return SRLC->getZExtValue() == SHLC->getZExtValue();
18541 return false;
18542 }
18543 }
18544 }
18545 }
18546 return true;
18547}
18548
18549bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
18550 const SDNode *N) const {
18551 assert(N->getOpcode() == ISD::XOR &&
18552 (N->getOperand(0).getOpcode() == ISD::SHL ||
18553 N->getOperand(0).getOpcode() == ISD::SRL) &&
18554 "Expected XOR(SHIFT) pattern");
18555
18556 // Only commute if the entire NOT mask is a hidden shifted mask.
18557 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
18558 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18559 if (XorC && ShiftC) {
18560 unsigned MaskIdx, MaskLen;
18561 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
18562 unsigned ShiftAmt = ShiftC->getZExtValue();
18563 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
18564 if (N->getOperand(0).getOpcode() == ISD::SHL)
18565 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
18566 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
18567 }
18568 }
18569
18570 return false;
18571}
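A standalone sketch of the mask test above for the SHL case on i32 with a shift amount of 8, leaving out the SelectionDAG plumbing: the NOT constant must be a contiguous run of ones starting at the shift amount and reaching the top bit.

#include <cstdint>
#include <cstdio>

// Is Mask a single contiguous run of ones? Report its start index and length.
static bool isShiftedMask(uint32_t Mask, unsigned &MaskIdx, unsigned &MaskLen) {
  if (Mask == 0)
    return false;
  MaskIdx = 0;
  while (!(Mask & (1u << MaskIdx)))
    ++MaskIdx;
  MaskLen = 0;
  while (MaskIdx + MaskLen < 32 && (Mask & (1u << (MaskIdx + MaskLen))))
    ++MaskLen;
  // Contiguous iff no set bits remain above the run.
  return MaskIdx + MaskLen == 32 || (Mask >> (MaskIdx + MaskLen)) == 0;
}

int main() {
  unsigned ShiftAmt = 8, BitWidth = 32, MaskIdx, MaskLen;
  uint32_t NotMask = 0xFFFFFF00u; // xor constant forming the NOT
  if (isShiftedMask(NotMask, MaskIdx, MaskLen))
    // For the SHL case the commute is allowed when the run starts at the
    // shift amount and reaches the top bit.
    printf("%d\n", MaskIdx == ShiftAmt && MaskLen == BitWidth - ShiftAmt); // 1
  return 0;
}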
18572
18573bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
18574 const SDNode *N, CombineLevel Level) const {
18575 assert(((N->getOpcode() == ISD::SHL &&
18576 N->getOperand(0).getOpcode() == ISD::SRL) ||
18577 (N->getOpcode() == ISD::SRL &&
18578 N->getOperand(0).getOpcode() == ISD::SHL)) &&
18579 "Expected shift-shift mask");
18580 // Don't allow multiuse shift folding with the same shift amount.
18581 if (!N->getOperand(0)->hasOneUse())
18582 return false;
18583
18584 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
18585 EVT VT = N->getValueType(0);
18586 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
18587 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18588 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
18589 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
18590 }
18591
18592 // We do not need to fold when this shift is used in the following load case:
18593 // (ldr x, (add x, (shl (srl x, c1) 2)))
18594 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
18595 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
18596 unsigned ShlAmt = C2->getZExtValue();
18597 if (auto ShouldADD = *N->user_begin();
18598 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
18599 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
18600 EVT MemVT = Load->getMemoryVT();
18601
18602 if (Load->getValueType(0).isScalableVector())
18603 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
18604
18605 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
18606 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
18607 }
18608 }
18609 }
18610 }
18611
18612 return true;
18613}
18614
18615bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
18616 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
18617 SDValue Y) const {
18618 return VT.isScalableVector() && isTypeLegal(VT) &&
18619 SelectOpcode == ISD::VSELECT;
18620}
18621
18622bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
18623 Type *Ty) const {
18624 assert(Ty->isIntegerTy());
18625
18626 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18627 if (BitSize == 0)
18628 return false;
18629
18630 int64_t Val = Imm.getSExtValue();
18631 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18632 return true;
18633
18634 if (Val < 0)
18635 Val = ~Val;
18636 if (BitSize == 32)
18637 Val &= (1LL << 32) - 1;
18638
18639 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18640 // MOVZ is free so return true for one or fewer MOVK.
18641 return Shift < 3;
18642}
18643
18644bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18645 unsigned Index) const {
18646 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
18647 return false;
18648
18649 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18650}
18651
18652/// Turn vector tests of the signbit in the form of:
18653/// xor (sra X, elt_size(X)-1), -1
18654/// into:
18655/// cmge X, X, #0
18656static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
18657 const AArch64Subtarget *Subtarget) {
18658 EVT VT = N->getValueType(0);
18659 if (!Subtarget->hasNEON() || !VT.isVector())
18660 return SDValue();
18661
18662 // There must be a shift right algebraic before the xor, and the xor must be a
18663 // 'not' operation.
18664 SDValue Shift = N->getOperand(0);
18665 SDValue Ones = N->getOperand(1);
18666 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18667 !ISD::isBuildVectorAllOnes(Ones.getNode()))
18668 return SDValue();
18669
18670 // The shift should be smearing the sign bit across each vector element.
18671 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18672 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18673 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18674 return SDValue();
18675
18676 SDLoc DL(N);
18677 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
18678 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
18679}
18680
18681// Given a vecreduce_add node, detect the below pattern and convert it to the
18682// node sequence with UABDL, [S|U]ADB and UADDLP.
18683//
18684// i32 vecreduce_add(
18685// v16i32 abs(
18686// v16i32 sub(
18687// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18688//
18689// or
18690//
18691// i32 vecreduce_add(
18692// v16i32 zext(
18693// v16i16 abs(
18694// v16i16 sub(
18695// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
18696//
18697// =================>
18698// i32 vecreduce_add(
18699// v4i32 UADDLP(
18700// v8i16 add(
18701// v8i16 zext(
18702// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18703// v8i16 zext(
18704// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
18705static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
18706 SelectionDAG &DAG) {
18707 // Assumed i32 vecreduce_add
18708 if (N->getValueType(0) != MVT::i32)
18709 return SDValue();
18710
18711 SDValue VecReduceOp0 = N->getOperand(0);
18712 bool SawTrailingZext = false;
18713 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
18714 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
18715 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
18716 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
18717 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
18718 SawTrailingZext = true;
18719 VecReduceOp0 = VecReduceOp0.getOperand(0);
18720 }
18721
18722 // The expected ABS input type depends on whether that zext was peeled off.
18723 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
18724 // Assumed v16i16 or v16i32 abs input
18725 unsigned Opcode = VecReduceOp0.getOpcode();
18726 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
18727 return SDValue();
18728
18729 SDValue ABS = VecReduceOp0;
18730 // Assumed v16i16 or v16i32 sub
18731 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18732 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
18733 return SDValue();
18734
18735 SDValue SUB = ABS->getOperand(0);
18736 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18737 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18738 // Assumed v16i16 or v16i32 type
18739 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
18740 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
18741 return SDValue();
18742
18743 // Assumed zext or sext
18744 bool IsZExt = false;
18745 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18746 IsZExt = true;
18747 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18748 IsZExt = false;
18749 } else
18750 return SDValue();
18751
18752 SDValue EXT0 = SUB->getOperand(0);
18753 SDValue EXT1 = SUB->getOperand(1);
18754 // Assumed zext's operand has v16i8 type
18755 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18756 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18757 return SDValue();
18758
18759  // The pattern is detected. Convert it to a sequence of nodes.
18760 SDLoc DL(N);
18761
18762 // First, create the node pattern of UABD/SABD.
18763 SDValue UABDHigh8Op0 =
18764 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18765 DAG.getConstant(8, DL, MVT::i64));
18766 SDValue UABDHigh8Op1 =
18767 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18768 DAG.getConstant(8, DL, MVT::i64));
18769 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18770 UABDHigh8Op0, UABDHigh8Op1);
18771 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18772
18773 // Second, create the node pattern of UABAL.
18774 SDValue UABDLo8Op0 =
18775 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18776 DAG.getConstant(0, DL, MVT::i64));
18777 SDValue UABDLo8Op1 =
18778 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18779 DAG.getConstant(0, DL, MVT::i64));
18780 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18781 UABDLo8Op0, UABDLo8Op1);
18782 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18783 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18784
18785 // Third, create the node of UADDLP.
18786 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18787
18788 // Fourth, create the node of VECREDUCE_ADD.
18789 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18790}
18791
18792static SDValue
18794 const AArch64Subtarget *ST) {
18795 if (DCI.isBeforeLegalize())
18796 return SDValue();
18797
18798 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
18799 /*IsEqual=*/false))
18800 return While;
18801
18802 if (!N->getValueType(0).isScalableVector() ||
18803 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
18804 return SDValue();
18805
18806 unsigned NumUses = N->use_size();
18807 auto MaskEC = N->getValueType(0).getVectorElementCount();
18808 if (!MaskEC.isKnownMultipleOf(NumUses))
18809 return SDValue();
18810
18811 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumUses);
18812 if (ExtMinEC.getKnownMinValue() < 2)
18813 return SDValue();
18814
18815 SmallVector<SDNode *> Extracts(NumUses, nullptr);
18816 for (SDNode *Use : N->users()) {
18817 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
18818 return SDValue();
18819
18820    // Ensure the extract type is correct (e.g. if NumUses is 4 and
18821    // the mask return type is nxv8i1, each extract should be nxv2i1).
18822 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
18823 return SDValue();
18824
18825 // There should be exactly one extract for each part of the mask.
18826 unsigned Offset = Use->getConstantOperandVal(1);
18827 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
18828 if (Extracts[Part] != nullptr)
18829 return SDValue();
18830
18831 Extracts[Part] = Use;
18832 }
18833
18834 SelectionDAG &DAG = DCI.DAG;
18835 SDLoc DL(N);
18836 SDValue ID =
18837 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
18838
18839 SDValue Idx = N->getOperand(0);
18840 SDValue TC = N->getOperand(1);
18841 EVT OpVT = Idx.getValueType();
18842 if (OpVT != MVT::i64) {
18843 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
18844 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
18845 }
18846
18847 // Create the whilelo_x2 intrinsics from each pair of extracts
18848 EVT ExtVT = Extracts[0]->getValueType(0);
18849 auto R =
18850 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18851 DCI.CombineTo(Extracts[0], R.getValue(0));
18852 DCI.CombineTo(Extracts[1], R.getValue(1));
18853
18854 if (NumUses == 2)
18855 return SDValue(N, 0);
18856
18857 auto Elts = DAG.getElementCount(DL, OpVT, ExtVT.getVectorElementCount() * 2);
18858 for (unsigned I = 2; I < NumUses; I += 2) {
18859 // After the first whilelo_x2, we need to increment the starting value.
18860 Idx = DAG.getNode(ISD::UADDSAT, DL, OpVT, Idx, Elts);
18861 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18862 DCI.CombineTo(Extracts[I], R.getValue(0));
18863 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
18864 }
18865
18866 return SDValue(N, 0);
18867}
18868
18869// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce add:
18870//   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
18871//   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
18872// If we have vectors larger than v16i8 we extract v16i8 subvectors, follow
18873// the same steps above to get DOT instructions, concatenate them, and
18874// generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
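//
// For example (illustrative): i32 vecreduce.add(zext(v16i8 a to v16i32))
// becomes vecreduce.add(UDOT(zeroes:v4i32, a, splat(1):v16i8)), since UDOT
// accumulates each group of four byte products into one i32 lane. A v24i8
// input is handled as one v16i8 DOT (v4i32) plus one v8i8 DOT (v2i32), with
// the two partial reductions added together at the end.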
18876 const AArch64Subtarget *ST) {
18877 if (!ST->isNeonAvailable())
18878 return SDValue();
18879
18880 if (!ST->hasDotProd())
18881    return performVecReduceAddCombineWithUADDLP(N, DAG);
18882
18883 SDValue Op0 = N->getOperand(0);
18884 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18885 Op0.getValueType().getVectorElementType() != MVT::i32)
18886 return SDValue();
18887
18888 unsigned ExtOpcode = Op0.getOpcode();
18889 SDValue A = Op0;
18890 SDValue B;
18891 unsigned DotOpcode;
18892 if (ExtOpcode == ISD::MUL) {
18893 A = Op0.getOperand(0);
18894 B = Op0.getOperand(1);
18895 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
18896 return SDValue();
18897 auto OpCodeA = A.getOpcode();
18898 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18899 return SDValue();
18900
18901 auto OpCodeB = B.getOpcode();
18902 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18903 return SDValue();
18904
18905 if (OpCodeA == OpCodeB) {
18906 DotOpcode =
18907 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18908 } else {
18909      // Check USDOT support
18910 if (!ST->hasMatMulInt8())
18911 return SDValue();
18912 DotOpcode = AArch64ISD::USDOT;
18913 if (OpCodeA == ISD::SIGN_EXTEND)
18914 std::swap(A, B);
18915 }
18916 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18917 DotOpcode = AArch64ISD::UDOT;
18918 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18919 DotOpcode = AArch64ISD::SDOT;
18920 } else {
18921 return SDValue();
18922 }
18923
18924 EVT Op0VT = A.getOperand(0).getValueType();
18925 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
18926 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
18927 if (!IsValidElementCount || !IsValidSize)
18928 return SDValue();
18929
18930 SDLoc DL(Op0);
18931 // For non-mla reductions B can be set to 1. For MLA we take the operand of
18932 // the extend B.
18933 if (!B)
18934 B = DAG.getConstant(1, DL, Op0VT);
18935 else
18936 B = B.getOperand(0);
18937
18938 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
18939 unsigned NumOfVecReduce;
18940 EVT TargetType;
18941 if (IsMultipleOf16) {
18942 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
18943 TargetType = MVT::v4i32;
18944 } else {
18945 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
18946 TargetType = MVT::v2i32;
18947 }
18948 // Handle the case where we need to generate only one Dot operation.
18949 if (NumOfVecReduce == 1) {
18950 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
18951 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
18952 A.getOperand(0), B);
18953 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18954 }
18955 // Generate Dot instructions that are multiple of 16.
18956 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
18957 SmallVector<SDValue, 4> SDotVec16;
18958 unsigned I = 0;
18959 for (; I < VecReduce16Num; I += 1) {
18960 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
18961 SDValue Op0 =
18962 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
18963 DAG.getConstant(I * 16, DL, MVT::i64));
18964 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
18965 DAG.getConstant(I * 16, DL, MVT::i64));
18966 SDValue Dot =
18967 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
18968 SDotVec16.push_back(Dot);
18969 }
18970 // Concatenate dot operations.
18971 EVT SDot16EVT =
18972 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
18973 SDValue ConcatSDot16 =
18974 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
18975 SDValue VecReduceAdd16 =
18976 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
18977 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
18978 if (VecReduce8Num == 0)
18979 return VecReduceAdd16;
18980
18981 // Generate the remainder Dot operation that is multiple of 8.
18982 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
18983 SDValue Vec8Op0 =
18984 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
18985 DAG.getConstant(I * 16, DL, MVT::i64));
18986 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
18987 DAG.getConstant(I * 16, DL, MVT::i64));
18988 SDValue Dot =
18989 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
18990 SDValue VecReduceAdd8 =
18991 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
18992 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
18993 VecReduceAdd8);
18994}
18995
18996// Given an (integer) vecreduce, we know the order of the inputs does not
18997// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
18998// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
18999// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
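// For example (illustrative), with x : v16i8 both forms sum all sixteen
// lanes: the original adds x[i] + x[i+8] into eight i16 lanes, while
// UADDLP(x) adds adjacent pairs x[2i] + x[2i+1]; since UADDV then sums every
// lane, the two groupings produce the same value.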
19001 auto DetectAddExtract = [&](SDValue A) {
19002 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
19003 // UADDLP(x) if found.
19004 assert(A.getOpcode() == ISD::ADD);
19005 EVT VT = A.getValueType();
19006 SDValue Op0 = A.getOperand(0);
19007 SDValue Op1 = A.getOperand(1);
19008 if (Op0.getOpcode() != Op1.getOpcode() ||
19009 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
19010 Op0.getOpcode() != ISD::SIGN_EXTEND))
19011 return SDValue();
19012 SDValue Ext0 = Op0.getOperand(0);
19013 SDValue Ext1 = Op1.getOperand(0);
19014    if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19015        Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19016        Ext0.getOperand(0) != Ext1.getOperand(0))
19017 return SDValue();
19018    // Check that the extracted type has twice as many elements as the add
19019    // type, and that the extracts are from upper/lower parts of the same source.
19020    if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
19021        VT.getVectorNumElements() * 2)
19022      return SDValue();
19023    if ((Ext0.getConstantOperandVal(1) != 0 ||
19024         Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
19025        (Ext1.getConstantOperandVal(1) != 0 ||
19026         Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
19027      return SDValue();
19028 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
19029 : AArch64ISD::SADDLP;
19030 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
19031 };
19032
19033 if (SDValue R = DetectAddExtract(A))
19034 return R;
19035
19036 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
19037 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
19038 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19039 A.getOperand(1));
19040 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
19041 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
19042 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19043 A.getOperand(0));
19044 return SDValue();
19045}
19046
19047// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19048// UADDLV(concat), where the concat represents the 64-bit zext sources.
19050 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19051 // UADDLV(concat(zext, zext)) if found.
19052 assert(A.getOpcode() == ISD::ADD);
19053 EVT VT = A.getValueType();
19054 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19055 return SDValue();
19056 SDValue Op0 = A.getOperand(0);
19057 SDValue Op1 = A.getOperand(1);
19058 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19059 return SDValue();
19060 SDValue Ext0 = Op0.getOperand(0);
19061 SDValue Ext1 = Op1.getOperand(0);
19062 EVT ExtVT0 = Ext0.getValueType();
19063 EVT ExtVT1 = Ext1.getValueType();
19064 // Check zext VTs are the same and 64-bit length.
19065 if (ExtVT0 != ExtVT1 ||
19066 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19067 return SDValue();
19068 // Get VT for concat of zext sources.
19069 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19070 SDValue Concat =
19071 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19072
19073 switch (VT.getSimpleVT().SimpleTy) {
19074 case MVT::v2i64:
19075 case MVT::v4i32:
19076 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19077 case MVT::v8i16: {
19078 SDValue Uaddlv =
19079 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19080 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19081 }
19082 default:
19083 llvm_unreachable("Unhandled vector type");
19084 }
19085}
19086
19088 SDValue A = N->getOperand(0);
19089 if (A.getOpcode() == ISD::ADD) {
19090 if (SDValue R = performUADDVAddCombine(A, DAG))
19091 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19092 else if (SDValue R = performUADDVZextCombine(A, DAG))
19093 return R;
19094 }
19095
19096  // uaddv(A) --> A if all lanes of A are known to be zero except the 0th lane.
19097 MVT OpVT = A.getSimpleValueType();
19098 assert(N->getSimpleValueType(0) == OpVT &&
19099 "The operand type should be consistent with the result type of UADDV");
19100  APInt Mask = APInt::getAllOnes(OpVT.getVectorNumElements());
19101  Mask.clearBit(0);
19102 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
19103 if (KnownLeadingLanes.isZero())
19104 return A;
19105
19106 return SDValue();
19107}
19108
19111 const AArch64Subtarget *Subtarget) {
19112 if (DCI.isBeforeLegalizeOps())
19113 return SDValue();
19114
19115 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
19116}
19117
19118SDValue
19119AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
19120 SelectionDAG &DAG,
19121 SmallVectorImpl<SDNode *> &Created) const {
19122 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19123 if (isIntDivCheap(N->getValueType(0), Attr))
19124 return SDValue(N, 0); // Lower SDIV as SDIV
19125
19126 EVT VT = N->getValueType(0);
19127
19128  // If SVE is available, we can generate
19129  //   sdiv(x,y) -> ptrue + asrd, where 'y' is a positive pow-2 divisor.
19130  //   sdiv(x,y) -> ptrue + asrd + subr, where 'y' is a negative pow-2 divisor.
19131 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
19132 return SDValue(N, 0);
19133
19134 // fold (sdiv X, pow2)
19135 if ((VT != MVT::i32 && VT != MVT::i64) ||
19136 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19137 return SDValue();
19138
19139  // If the divisor is 2 or -2, the default expansion is better: it adds the
19140  // logically shifted sign bit (X >>u (BitWidth - 1)) to X before shifting right.
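  // Rough sketch of that default expansion for i32 X / 2 (illustrative, not
  // taken from this file): (X + (X >>u 31)) >>s 1, which typically selects to
  // "add w8, w0, w0, lsr #31; asr w0, w8, #1", so a CSEL/CMOV-based sequence
  // would not be an improvement for these divisors.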
19141 if (Divisor == 2 ||
19142 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
19143 return SDValue();
19144
19145 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
19146}
19147
19148SDValue
19149AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
19150 SelectionDAG &DAG,
19151 SmallVectorImpl<SDNode *> &Created) const {
19152 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19153 if (isIntDivCheap(N->getValueType(0), Attr))
19154 return SDValue(N, 0); // Lower SREM as SREM
19155
19156 EVT VT = N->getValueType(0);
19157
19158  // For scalable and fixed types, mark them as cheap so we can handle them
19159  // much later. This allows us to handle larger-than-legal types.
19160 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
19161 return SDValue(N, 0);
19162
19163 // fold (srem X, pow2)
19164 if ((VT != MVT::i32 && VT != MVT::i64) ||
19165 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19166 return SDValue();
19167
19168 unsigned Lg2 = Divisor.countr_zero();
19169 if (Lg2 == 0)
19170 return SDValue();
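  // The CSNEG sequence built below conceptually computes
  //   X > 0 ? (X & (2^Lg2 - 1)) : -((-X) & (2^Lg2 - 1)).
  // Worked example (illustrative): X = -13, divisor 8 -> (-X) & 7 == 5, so
  // the result is -5, which matches -13 srem 8.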
19171
19172 SDLoc DL(N);
19173 SDValue N0 = N->getOperand(0);
19174 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
19175 SDValue Zero = DAG.getConstant(0, DL, VT);
19176 SDValue CCVal, CSNeg;
19177 if (Lg2 == 1) {
19178 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
19179 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19180 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
19181
19182 Created.push_back(Cmp.getNode());
19183 Created.push_back(And.getNode());
19184 } else {
19185 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
19186 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
19187
19188 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
19189 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19190 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
19191 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
19192 Negs.getValue(1));
19193
19194 Created.push_back(Negs.getNode());
19195 Created.push_back(AndPos.getNode());
19196 Created.push_back(AndNeg.getNode());
19197 }
19198
19199 return CSNeg;
19200}
19201
19202static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
19203 switch(getIntrinsicID(S.getNode())) {
19204 default:
19205 break;
19206 case Intrinsic::aarch64_sve_cntb:
19207 return 8;
19208 case Intrinsic::aarch64_sve_cnth:
19209 return 16;
19210 case Intrinsic::aarch64_sve_cntw:
19211 return 32;
19212 case Intrinsic::aarch64_sve_cntd:
19213 return 64;
19214 }
19215 return {};
19216}
19217
19218/// Calculates what the pre-extend type is, based on the extension
19219/// operation node provided by \p Extend.
19220///
19221/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
19222/// pre-extend type is pulled directly from the operand, while other extend
19223/// operations need a bit more inspection to get this information.
19224///
19225/// \param Extend The SDNode from the DAG that represents the extend operation
19226///
19227/// \returns The type representing the \p Extend source type, or \p MVT::Other
19228/// if no valid type can be determined
19230 switch (Extend.getOpcode()) {
19231 case ISD::SIGN_EXTEND:
19232 case ISD::ZERO_EXTEND:
19233 case ISD::ANY_EXTEND:
19234 return Extend.getOperand(0).getValueType();
19235 case ISD::AssertSext:
19236 case ISD::AssertZext:
19238 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
19239 if (!TypeNode)
19240 return MVT::Other;
19241 return TypeNode->getVT();
19242 }
19243 case ISD::AND: {
19245    ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(Extend.getOperand(1));
19246    if (!Constant)
19247 return MVT::Other;
19248
19249 uint32_t Mask = Constant->getZExtValue();
19250
19251 if (Mask == UCHAR_MAX)
19252 return MVT::i8;
19253 else if (Mask == USHRT_MAX)
19254 return MVT::i16;
19255 else if (Mask == UINT_MAX)
19256 return MVT::i32;
19257
19258 return MVT::Other;
19259 }
19260 default:
19261 return MVT::Other;
19262 }
19263}
19264
19265/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
19266/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
19267/// SExt/ZExt rather than the scalar SExt/ZExt
19269 EVT VT = BV.getValueType();
19270 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
19272 return SDValue();
19273
19274 // Use the first item in the buildvector/shuffle to get the size of the
19275 // extend, and make sure it looks valid.
19276 SDValue Extend = BV->getOperand(0);
19277 unsigned ExtendOpcode = Extend.getOpcode();
19278 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
19279 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
19280 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
19281 ExtendOpcode == ISD::AssertSext;
19282 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
19283 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
19284 return SDValue();
19285 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
19286 // ensure calculatePreExtendType will work without issue.
19287 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
19288 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
19289 return SDValue();
19290
19291 // Restrict valid pre-extend data type
19292 EVT PreExtendType = calculatePreExtendType(Extend);
19293 if (PreExtendType == MVT::Other ||
19294 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
19295 return SDValue();
19296
19297 // Make sure all other operands are equally extended.
19298 bool SeenZExtOrSExt = !IsAnyExt;
19299 for (SDValue Op : drop_begin(BV->ops())) {
19300 if (Op.isUndef())
19301 continue;
19302
19303 if (calculatePreExtendType(Op) != PreExtendType)
19304 return SDValue();
19305
19306 unsigned Opc = Op.getOpcode();
19307 if (Opc == ISD::ANY_EXTEND)
19308 continue;
19309
19310 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
19312
19313 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
19314 return SDValue();
19315
19316 IsSExt = OpcIsSExt;
19317 SeenZExtOrSExt = true;
19318 }
19319
19320 SDValue NBV;
19321 SDLoc DL(BV);
19322 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
19323 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
19324 EVT PreExtendLegalType =
19325 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
19327 for (SDValue Op : BV->ops())
19328 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
19329 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
19330 PreExtendLegalType));
19331 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
19332 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
19333 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
19334 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
19335 BV.getOperand(1).isUndef()
19336 ? DAG.getUNDEF(PreExtendVT)
19337 : BV.getOperand(1).getOperand(0),
19338 cast<ShuffleVectorSDNode>(BV)->getMask());
19339 }
19340 unsigned ExtOpc = !SeenZExtOrSExt
19342 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
19343 return DAG.getNode(ExtOpc, DL, VT, NBV);
19344}
19345
19346/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
19347/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
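///
/// For example (illustrative), mul(v8i16 dup(sext i8 %c to i16),
/// sext(v8i8 %v to v8i16)) becomes mul(sext(v8i8 dup(%c)), sext(v8i8 %v)),
/// exposing the widening multiply form that can later be matched to
/// SMULL/UMULL; the combine itself only moves the extends, it does not create
/// the multiply.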
19349 // If the value type isn't a vector, none of the operands are going to be dups
19350 EVT VT = Mul->getValueType(0);
19351 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19352 return SDValue();
19353
19354 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
19355 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
19356
19357 // Neither operands have been changed, don't make any further changes
19358 if (!Op0 && !Op1)
19359 return SDValue();
19360
19361 SDLoc DL(Mul);
19362 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
19363 Op1 ? Op1 : Mul->getOperand(1));
19364}
19365
19366// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
19367// Same for other types with equivalent constants.
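//
// Informal justification (not from the original comment): after the srl by 15,
// bits 0 and 16 of each i32 lane hold the sign bits of its two i16 halves;
// the 0x10001 mask keeps only those bits, and multiplying by 0xffff expands
// each kept bit into an all-ones i16 half (b0*0xffff + b1*0xffff0000). That
// is precisely a per-i16 "compare less than zero", hence the CMLTz.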
19369 EVT VT = N->getValueType(0);
19370 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
19371 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
19372 return SDValue();
19373 if (N->getOperand(0).getOpcode() != ISD::AND ||
19374 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
19375 return SDValue();
19376
19377 SDValue And = N->getOperand(0);
19378 SDValue Srl = And.getOperand(0);
19379
19380 APInt V1, V2, V3;
19381 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
19382 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
19383      !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
19384    return SDValue();
19385
19386 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
19387 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
19388 V3 != (HalfSize - 1))
19389 return SDValue();
19390
19391 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19392 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
19393 VT.getVectorElementCount() * 2);
19394
19395 SDLoc DL(N);
19396 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
19397 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
19398 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
19399 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
19400}
19401
19402// Transform vector add(zext i8 to i32, zext i8 to i32)
19403// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19404// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19405// extends.
19407 EVT VT = N->getValueType(0);
19408 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19409 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19410 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19411 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19412 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19413 N->getOperand(0).getOperand(0).getValueType() !=
19414 N->getOperand(1).getOperand(0).getValueType())
19415 return SDValue();
19416
19417 if (N->getOpcode() == ISD::MUL &&
19418 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
19419 return SDValue();
19420
19421 SDValue N0 = N->getOperand(0).getOperand(0);
19422 SDValue N1 = N->getOperand(1).getOperand(0);
19423 EVT InVT = N0.getValueType();
19424
19425 EVT S1 = InVT.getScalarType();
19426 EVT S2 = VT.getScalarType();
19427 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19428 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19429 SDLoc DL(N);
19430 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19433 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19434 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19435 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19436 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
19437 : (unsigned)ISD::SIGN_EXTEND,
19438 DL, VT, NewOp);
19439 }
19440 return SDValue();
19441}
19442
19445 const AArch64Subtarget *Subtarget) {
19446
19447 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
19448 return Ext;
19449  if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
19450    return Ext;
19451 if (SDValue Ext = performVectorExtCombine(N, DAG))
19452 return Ext;
19453
19454 if (DCI.isBeforeLegalizeOps())
19455 return SDValue();
19456
19457 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
19458 // and in MachineCombiner pass, add+mul will be combined into madd.
19459 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
19460 SDLoc DL(N);
19461 EVT VT = N->getValueType(0);
19462 SDValue N0 = N->getOperand(0);
19463 SDValue N1 = N->getOperand(1);
19464 SDValue MulOper;
19465 unsigned AddSubOpc;
19466
19467 auto IsAddSubWith1 = [&](SDValue V) -> bool {
19468 AddSubOpc = V->getOpcode();
19469 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
19470 SDValue Opnd = V->getOperand(1);
19471 MulOper = V->getOperand(0);
19472 if (AddSubOpc == ISD::SUB)
19473 std::swap(Opnd, MulOper);
19474 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
19475 return C->isOne();
19476 }
19477 return false;
19478 };
19479
19480 if (IsAddSubWith1(N0)) {
19481 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
19482 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
19483 }
19484
19485 if (IsAddSubWith1(N1)) {
19486 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
19487 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
19488 }
19489
19490 // The below optimizations require a constant RHS.
19491 if (!isa<ConstantSDNode>(N1))
19492 return SDValue();
19493
19494  ConstantSDNode *C = cast<ConstantSDNode>(N1);
19495  const APInt &ConstValue = C->getAPIntValue();
19496
19497  // Allow the scaling to be folded into the `cnt` instruction by preventing
19498  // the scaling from being obscured here. This makes it easier to pattern match.
19499 if (IsSVECntIntrinsic(N0) ||
19500 (N0->getOpcode() == ISD::TRUNCATE &&
19501 (IsSVECntIntrinsic(N0->getOperand(0)))))
19502 if (ConstValue.sge(1) && ConstValue.sle(16))
19503 return SDValue();
19504
19505 // Multiplication of a power of two plus/minus one can be done more
19506 // cheaply as shift+add/sub. For now, this is true unilaterally. If
19507 // future CPUs have a cheaper MADD instruction, this may need to be
19508 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
19509 // 64-bit is 5 cycles, so this is always a win.
19510 // More aggressively, some multiplications N0 * C can be lowered to
19511 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
19512 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
19513 // TODO: lower more cases.
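  // Worked example (illustrative): C = 45 = (1 + 4) * (1 + 8), so
  //   MVal   = (x << 2) + x        ; 5 * x
  //   Result = (MVal << 3) + MVal  ; 40 * x + 5 * x = 45 * x
  // i.e. two shift+add operations instead of a MOV plus MADD.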
19514
19515 // TrailingZeroes is used to test if the mul can be lowered to
19516 // shift+add+shift.
19517 unsigned TrailingZeroes = ConstValue.countr_zero();
19518 if (TrailingZeroes) {
19519 // Conservatively do not lower to shift+add+shift if the mul might be
19520 // folded into smul or umul.
19521 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
19522 isZeroExtended(N0, DAG)))
19523 return SDValue();
19524 // Conservatively do not lower to shift+add+shift if the mul might be
19525 // folded into madd or msub.
19526 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
19527 N->user_begin()->getOpcode() == ISD::SUB))
19528 return SDValue();
19529 }
19530 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
19531 // and shift+add+shift.
19532 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
19533 unsigned ShiftAmt;
19534
19535 auto Shl = [&](SDValue N0, unsigned N1) {
19536 if (!N0.getNode())
19537 return SDValue();
19538 // If shift causes overflow, ignore this combine.
19539 if (N1 >= N0.getValueSizeInBits())
19540 return SDValue();
19541 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
19542 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
19543 };
19544 auto Add = [&](SDValue N0, SDValue N1) {
19545 if (!N0.getNode() || !N1.getNode())
19546 return SDValue();
19547 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
19548 };
19549 auto Sub = [&](SDValue N0, SDValue N1) {
19550 if (!N0.getNode() || !N1.getNode())
19551 return SDValue();
19552 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
19553 };
19554 auto Negate = [&](SDValue N) {
19555 if (!N0.getNode())
19556 return SDValue();
19557 SDValue Zero = DAG.getConstant(0, DL, VT);
19558 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
19559 };
19560
19561  // Can the const C be decomposed into (1 + 2^M) * (1 + 2^N), eg:
19562  // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
19563  // the (2^N - 1) can't be executed via a single instruction.
19564 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
19565 unsigned BitWidth = C.getBitWidth();
19566 for (unsigned i = 1; i < BitWidth / 2; i++) {
19567 APInt Rem;
19568 APInt X(BitWidth, (1 << i) + 1);
19569 APInt::sdivrem(C, X, N, Rem);
19570 APInt NVMinus1 = N - 1;
19571 if (Rem == 0 && NVMinus1.isPowerOf2()) {
19572 M = X;
19573 return true;
19574 }
19575 }
19576 return false;
19577 };
19578
19579  // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
19580  // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
19581  // the (2^N - 1) can't be executed via a single instruction.
19582 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
19583 APInt CVMinus1 = C - 1;
19584 if (CVMinus1.isNegative())
19585 return false;
19586 unsigned TrailingZeroes = CVMinus1.countr_zero();
19587 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
19588 if (SCVMinus1.isPowerOf2()) {
19589 unsigned BitWidth = SCVMinus1.getBitWidth();
19590 M = APInt(BitWidth, SCVMinus1.logBase2());
19591 N = APInt(BitWidth, TrailingZeroes);
19592 return true;
19593 }
19594 return false;
19595 };
19596
19597 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
19598 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
19599 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
19600 APInt CVMinus1 = C - 1;
19601 if (CVMinus1.isNegative())
19602 return false;
19603 unsigned TrailingZeroes = CVMinus1.countr_zero();
19604 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
19605 if (CVPlus1.isPowerOf2()) {
19606 unsigned BitWidth = CVPlus1.getBitWidth();
19607 M = APInt(BitWidth, CVPlus1.logBase2());
19608 N = APInt(BitWidth, TrailingZeroes);
19609 return true;
19610 }
19611 return false;
19612 };
19613
19614 if (ConstValue.isNonNegative()) {
19615 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
19616 // (mul x, 2^N - 1) => (sub (shl x, N), x)
19617 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
19618 // (mul x, (2^M + 1) * (2^N + 1))
19619 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
19620    // (mul x, (2^M + 1) * 2^N + 1)
19621    //     => MV = (add (shl x, M), x); (add (shl MV, N), x)
19622    // (mul x, 1 - (1 - 2^M) * 2^N)
19623    //     => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
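    // Worked example for the last form (illustrative): C = 29 = 1 - (1 - 8) * 4:
    //   MV     = x - (x << 3)   ; -7 * x
    //   Result = x - (MV << 2)  ; x + 28 * x = 29 * x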
19624 APInt SCVMinus1 = ShiftedConstValue - 1;
19625 APInt SCVPlus1 = ShiftedConstValue + 1;
19626 APInt CVPlus1 = ConstValue + 1;
19627 APInt CVM, CVN;
19628 if (SCVMinus1.isPowerOf2()) {
19629 ShiftAmt = SCVMinus1.logBase2();
19630 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
19631 } else if (CVPlus1.isPowerOf2()) {
19632 ShiftAmt = CVPlus1.logBase2();
19633 return Sub(Shl(N0, ShiftAmt), N0);
19634 } else if (SCVPlus1.isPowerOf2()) {
19635 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19636 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
19637 }
19638 if (Subtarget->hasALULSLFast() &&
19639 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
19640 APInt CVMMinus1 = CVM - 1;
19641 APInt CVNMinus1 = CVN - 1;
19642 unsigned ShiftM1 = CVMMinus1.logBase2();
19643 unsigned ShiftN1 = CVNMinus1.logBase2();
19644      // ALULSLFast implies that shifts by <= 4 places are fast.
19645 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
19646 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
19647 return Add(Shl(MVal, ShiftN1), MVal);
19648 }
19649 }
19650 if (Subtarget->hasALULSLFast() &&
19651 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
19652 unsigned ShiftM = CVM.getZExtValue();
19653 unsigned ShiftN = CVN.getZExtValue();
19654      // ALULSLFast implies that shifts by <= 4 places are fast.
19655 if (ShiftM <= 4 && ShiftN <= 4) {
19656 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
19657 return Add(Shl(MVal, CVN.getZExtValue()), N0);
19658 }
19659 }
19660
19661 if (Subtarget->hasALULSLFast() &&
19662 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
19663 unsigned ShiftM = CVM.getZExtValue();
19664 unsigned ShiftN = CVN.getZExtValue();
19665      // ALULSLFast implies that shifts by <= 4 places are fast.
19666 if (ShiftM <= 4 && ShiftN <= 4) {
19667 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
19668 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
19669 }
19670 }
19671 } else {
19672 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
19673 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
19674 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
19675 APInt SCVPlus1 = -ShiftedConstValue + 1;
19676 APInt CVNegPlus1 = -ConstValue + 1;
19677 APInt CVNegMinus1 = -ConstValue - 1;
19678 if (CVNegPlus1.isPowerOf2()) {
19679 ShiftAmt = CVNegPlus1.logBase2();
19680 return Sub(N0, Shl(N0, ShiftAmt));
19681 } else if (CVNegMinus1.isPowerOf2()) {
19682 ShiftAmt = CVNegMinus1.logBase2();
19683 return Negate(Add(Shl(N0, ShiftAmt), N0));
19684 } else if (SCVPlus1.isPowerOf2()) {
19685 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19686 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19687 }
19688 }
19689
19690 return SDValue();
19691}
19692
19694 SelectionDAG &DAG) {
19695 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19696 // optimize away operation when it's from a constant.
19697 //
19698 // The general transformation is:
19699 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19700 // AND(VECTOR_CMP(x,y), constant2)
19701 // constant2 = UNARYOP(constant)
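  //
  // For example (illustrative), sint_to_fp(and(vector_cmp(x, y), splat(1)))
  // becomes bitcast(and(vector_cmp(x, y), bitcast(splat(1.0)))): each compare
  // lane is all-ones or zero, so the AND yields either the bit pattern of the
  // converted constant or of 0.0, matching the original result lane by lane.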
19702
19703 // Early exit if this isn't a vector operation, the operand of the
19704 // unary operation isn't a bitwise AND, or if the sizes of the operations
19705 // aren't the same.
19706 EVT VT = N->getValueType(0);
19707 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
19708 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
19709 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
19710 return SDValue();
19711
19712 // Now check that the other operand of the AND is a constant. We could
19713 // make the transformation for non-constant splats as well, but it's unclear
19714 // that would be a benefit as it would not eliminate any operations, just
19715 // perform one more step in scalar code before moving to the vector unit.
19716 if (BuildVectorSDNode *BV =
19717 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
19718 // Bail out if the vector isn't a constant.
19719 if (!BV->isConstant())
19720 return SDValue();
19721
19722 // Everything checks out. Build up the new and improved node.
19723 SDLoc DL(N);
19724 EVT IntVT = BV->getValueType(0);
19725 // Create a new constant of the appropriate type for the transformed
19726 // DAG.
19727 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19728 // The AND node needs bitcasts to/from an integer vector type around it.
19729 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19730 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19731 N->getOperand(0)->getOperand(0), MaskConst);
19732 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19733 return Res;
19734 }
19735
19736 return SDValue();
19737}
19738
19739/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19740/// functions; this can help to reduce the number of fmovs to/from GPRs.
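///
/// For example (illustrative), an f32 -> i32 fptosi would be rewritten below
/// as: insert the f32 into lane 0 of an nxv4f32, convert the whole vector to
/// nxv4i32, then extract lane 0 as the scalar result.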
19741static SDValue
19744 const AArch64Subtarget *Subtarget) {
19745 if (N->isStrictFPOpcode())
19746 return SDValue();
19747
19748 if (DCI.isBeforeLegalizeOps())
19749 return SDValue();
19750
19751 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19752 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19753 return SDValue();
19754
19755 auto isSupportedType = [](EVT VT) {
19756 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19757 };
19758
19759 SDValue SrcVal = N->getOperand(0);
19760 EVT SrcTy = SrcVal.getValueType();
19761 EVT DestTy = N->getValueType(0);
19762
19763 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19764 return SDValue();
19765
19766 EVT SrcVecTy;
19767 EVT DestVecTy;
19768 if (DestTy.bitsGT(SrcTy)) {
19769 DestVecTy = getPackedSVEVectorVT(DestTy);
19770 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19771 } else {
19772 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19773 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19774 }
19775
19776 // Ensure the resulting src/dest vector type is legal.
19777 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19778 return SDValue();
19779
19780 SDLoc DL(N);
19781 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19782 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19783 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19784 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19785 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19786}
19787
19790 const AArch64Subtarget *Subtarget) {
19791 // First try to optimize away the conversion when it's conditionally from
19792 // a constant. Vectors only.
19794 return Res;
19795
19796 if (SDValue Res =
19797 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19798 return Res;
19799
19800 EVT VT = N->getValueType(0);
19801 if (VT != MVT::f32 && VT != MVT::f64)
19802 return SDValue();
19803
19804 // Only optimize when the source and destination types have the same width.
19805 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19806 return SDValue();
19807
19808 // If the result of an integer load is only used by an integer-to-float
19809  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
19810 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
19811 SDValue N0 = N->getOperand(0);
19812 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
19813 N0.hasOneUse() &&
19814 // Do not change the width of a volatile load.
19815 !cast<LoadSDNode>(N0)->isVolatile()) {
19816 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19817 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
19818 LN0->getPointerInfo(), LN0->getAlign(),
19819 LN0->getMemOperand()->getFlags());
19820
19821 // Make sure successors of the original load stay after it by updating them
19822 // to use the new Chain.
19823 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
19824
19825 unsigned Opcode =
19826 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
19827 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
19828 }
19829
19830 return SDValue();
19831}
19832
19833/// Fold a floating-point multiply by power of two into floating-point to
19834/// fixed-point conversion.
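///
/// For example (illustrative), fptosi(fmul x:v4f32, splat(16.0)) can be folded
/// into the aarch64_neon_vcvtfp2fxs intrinsic with 4 fractional bits, i.e. a
/// single fixed-point FCVTZS with an immediate shift instead of an FMUL
/// followed by a separate convert.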
19837 const AArch64Subtarget *Subtarget) {
19838 if (SDValue Res =
19839 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19840 return Res;
19841
19842 if (!Subtarget->isNeonAvailable())
19843 return SDValue();
19844
19845 if (!N->getValueType(0).isSimple())
19846 return SDValue();
19847
19848 SDValue Op = N->getOperand(0);
19849 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19850 return SDValue();
19851
19852 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19853 return SDValue();
19854
19855 SDValue ConstVec = Op->getOperand(1);
19856 if (!isa<BuildVectorSDNode>(ConstVec))
19857 return SDValue();
19858
19859 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19860 uint32_t FloatBits = FloatTy.getSizeInBits();
19861 if (FloatBits != 32 && FloatBits != 64 &&
19862 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19863 return SDValue();
19864
19865 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
19866 uint32_t IntBits = IntTy.getSizeInBits();
19867 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19868 return SDValue();
19869
19870 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19871 if (IntBits > FloatBits)
19872 return SDValue();
19873
19874 BitVector UndefElements;
19875  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
19876  int32_t Bits = IntBits == 64 ? 64 : 32;
19877 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
19878 if (C == -1 || C == 0 || C > Bits)
19879 return SDValue();
19880
19881 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19882 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
19883 return SDValue();
19884
19885 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19886 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19887 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19888 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19889 return SDValue();
19890 }
19891
19892 SDLoc DL(N);
19893 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19894 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19895 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19896 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19897 SDValue FixConv =
19899 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
19900 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
19901 // We can handle smaller integers by generating an extra trunc.
19902 if (IntBits < FloatBits)
19903 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
19904
19905 return FixConv;
19906}
19907
19908// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19909// convert to csel(ccmp(.., cc0)), depending on cc1:
19910
19911// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19912// =>
19913// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19914//
19915// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19916// =>
19917// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
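//
// For example (illustrative), for i1 (a == 0) & (b == 0) this typically
// selects to something like
//   cmp  x0, #0
//   ccmp x1, #0, #0, eq
//   cset w0, eq
// where the #0 NZCV immediate makes the final EQ test fail whenever the first
// compare already failed.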
19919 EVT VT = N->getValueType(0);
19920 SDValue CSel0 = N->getOperand(0);
19921 SDValue CSel1 = N->getOperand(1);
19922
19923 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
19924 CSel1.getOpcode() != AArch64ISD::CSEL)
19925 return SDValue();
19926
19927 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
19928 return SDValue();
19929
19930 if (!isNullConstant(CSel0.getOperand(0)) ||
19931 !isOneConstant(CSel0.getOperand(1)) ||
19932 !isNullConstant(CSel1.getOperand(0)) ||
19933 !isOneConstant(CSel1.getOperand(1)))
19934 return SDValue();
19935
19936 SDValue Cmp0 = CSel0.getOperand(3);
19937 SDValue Cmp1 = CSel1.getOperand(3);
19940 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
19941 return SDValue();
19942 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
19943 Cmp0.getOpcode() == AArch64ISD::SUBS) {
19944 std::swap(Cmp0, Cmp1);
19945 std::swap(CC0, CC1);
19946 }
19947
19948 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
19949 return SDValue();
19950
19951 SDLoc DL(N);
19952 SDValue CCmp, Condition;
19953 unsigned NZCV;
19954
19955 if (N->getOpcode() == ISD::AND) {
19957 Condition = getCondCode(DAG, InvCC0);
19959 } else {
19961 Condition = getCondCode(DAG, CC0);
19963 }
19964
19965 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
19966
19967 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
19968 if (Op1 && Op1->getAPIntValue().isNegative() &&
19969 Op1->getAPIntValue().sgt(-32)) {
19970    // CCMP accepts a constant in the range [0, 31].
19971    // If Op1 is a constant in the range [-31, -1], we can select CCMN
19972    // instead to avoid the extra mov.
19973 SDValue AbsOp1 =
19974 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
19975 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
19976 AbsOp1, NZCVOp, Condition, Cmp0);
19977 } else {
19978 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
19979 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
19980 }
19981 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
19982 CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
19983}
19984
19986 const AArch64Subtarget *Subtarget,
19987 const AArch64TargetLowering &TLI) {
19988 SelectionDAG &DAG = DCI.DAG;
19989
19990 if (SDValue R = performANDORCSELCombine(N, DAG))
19991 return R;
19992
19993 return SDValue();
19994}
19995
19997 if (!MemVT.getVectorElementType().isSimple())
19998 return false;
19999
20000 uint64_t MaskForTy = 0ull;
20001 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
20002 case MVT::i8:
20003 MaskForTy = 0xffull;
20004 break;
20005 case MVT::i16:
20006 MaskForTy = 0xffffull;
20007 break;
20008 case MVT::i32:
20009 MaskForTy = 0xffffffffull;
20010 break;
20011 default:
20012 return false;
20013 break;
20014 }
20015
20016 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
20017 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
20018 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
20019
20020 return false;
20021}
20022
20024 SDValue LeafOp = SDValue(N, 0);
20025 SDValue Op = N->getOperand(0);
20026 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
20027 LeafOp.getValueType() != Op.getValueType())
20028 Op = Op->getOperand(0);
20029 if (LeafOp.getValueType() == Op.getValueType())
20030 return Op;
20031 return SDValue();
20032}
20033
20036 SelectionDAG &DAG = DCI.DAG;
20037 SDValue Src = N->getOperand(0);
20038 unsigned Opc = Src->getOpcode();
20039
20040 // Zero/any extend of an unsigned unpack
20041 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
20042 SDValue UnpkOp = Src->getOperand(0);
20043 SDValue Dup = N->getOperand(1);
20044
20045 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
20046 return SDValue();
20047
20048 SDLoc DL(N);
20049    auto *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
20050    if (!C)
20051 return SDValue();
20052
20053 uint64_t ExtVal = C->getZExtValue();
20054
20055 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
20056 return ((ExtVal == 0xFF && VT == MVT::i8) ||
20057 (ExtVal == 0xFFFF && VT == MVT::i16) ||
20058 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
20059 };
20060
20061 // If the mask is fully covered by the unpack, we don't need to push
20062 // a new AND onto the operand
20063 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
20064 if (MaskAndTypeMatch(EltTy))
20065 return Src;
20066
20067 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
20068 // to see if the mask is all-ones of size MemTy.
20069 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
20070 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
20071 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
20072 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
20073 if (MaskAndTypeMatch(EltTy))
20074 return Src;
20075 }
20076
20077 // Truncate to prevent a DUP with an over wide constant
20078 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
20079
20080 // Otherwise, make sure we propagate the AND to the operand
20081 // of the unpack
20082 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
20083 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
20084
20085 SDValue And = DAG.getNode(ISD::AND, DL,
20086 UnpkOp->getValueType(0), UnpkOp, Dup);
20087
20088 return DAG.getNode(Opc, DL, N->getValueType(0), And);
20089 }
20090
20091 if (DCI.isBeforeLegalizeOps())
20092 return SDValue();
20093
20094  // An AND with an all-active predicate is a no-op; just return the other
20095  // operand.
20096 if (isAllActivePredicate(DAG, N->getOperand(0)))
20097 return N->getOperand(1);
20098 if (isAllActivePredicate(DAG, N->getOperand(1)))
20099 return N->getOperand(0);
20100
20102 return SDValue();
20103
20104 SDValue Mask = N->getOperand(1);
20105
20106 if (!Src.hasOneUse())
20107 return SDValue();
20108
20109 EVT MemVT;
20110
20111 // SVE load instructions perform an implicit zero-extend, which makes them
20112 // perfect candidates for combining.
20113 switch (Opc) {
20114 case AArch64ISD::LD1_MERGE_ZERO:
20115 case AArch64ISD::LDNF1_MERGE_ZERO:
20116 case AArch64ISD::LDFF1_MERGE_ZERO:
20117 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
20118 break;
20119 case AArch64ISD::GLD1_MERGE_ZERO:
20120 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20121 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20122 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20123 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20124 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20125 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20126 case AArch64ISD::GLDFF1_MERGE_ZERO:
20127 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
20128 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
20129 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
20130 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
20131 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
20132 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
20133 case AArch64ISD::GLDNT1_MERGE_ZERO:
20134 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
20135 break;
20136 default:
20137 return SDValue();
20138 }
20139
20140 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
20141 return Src;
20142
20143 return SDValue();
20144}
20145
20146// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
20149
20150 // This function performs an optimization on a specific pattern involving
20151 // an AND operation and SETCC (Set Condition Code) node.
20152
20153 SDValue SetCC = N->getOperand(0);
20154 EVT VT = N->getValueType(0);
20155 SelectionDAG &DAG = DCI.DAG;
20156
20157  // If the current node (N) is used by any SELECT instruction, return an
20158  // empty SDValue and skip the optimization, which could otherwise produce
20159  // incorrect results.
20160 for (auto U : N->users())
20161 if (U->getOpcode() == ISD::SELECT)
20162 return SDValue();
20163
20164 // Check if the operand is a SETCC node with floating-point comparison
20165 if (SetCC.getOpcode() == ISD::SETCC &&
20166 SetCC.getOperand(0).getValueType() == MVT::f32) {
20167
20168 SDValue Cmp;
20170
20171 // Check if the DAG is after legalization and if we can emit the conjunction
20172 if (!DCI.isBeforeLegalize() &&
20173 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
20174
20176
20177 SDLoc DL(N);
20178 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
20179 DAG.getConstant(0, DL, VT),
20180 getCondCode(DAG, InvertedCC), Cmp);
20181 }
20182 }
20183 return SDValue();
20184}
20185
20188 SelectionDAG &DAG = DCI.DAG;
20189 SDValue LHS = N->getOperand(0);
20190 SDValue RHS = N->getOperand(1);
20191 EVT VT = N->getValueType(0);
20192
20193 if (SDValue R = performANDORCSELCombine(N, DAG))
20194 return R;
20195
20196  if (SDValue R = performANDSETCCCombine(N, DCI))
20197 return R;
20198
20199 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
20200 return SDValue();
20201
20202 if (VT.isScalableVector())
20203 return performSVEAndCombine(N, DCI);
20204
20205 // The combining code below works only for NEON vectors. In particular, it
20206 // does not work for SVE when dealing with vectors wider than 128 bits.
20207 if (!VT.is64BitVector() && !VT.is128BitVector())
20208 return SDValue();
20209
20211 if (!BVN)
20212 return SDValue();
20213
20214 // AND does not accept an immediate, so check if we can use a BIC immediate
20215 // instruction instead. We do this here instead of using a (and x, (mvni imm))
20216 // pattern in isel, because some immediates may be lowered to the preferred
20217 // (and x, (movi imm)) form, even though an mvni representation also exists.
20218 APInt DefBits(VT.getSizeInBits(), 0);
20219 APInt UndefBits(VT.getSizeInBits(), 0);
20220 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
20221 SDValue NewOp;
20222
20223 // Any bits known to already be 0 need not be cleared again, which can help
20224 // reduce the size of the immediate to one supported by the instruction.
20225 KnownBits Known = DAG.computeKnownBits(LHS);
20226 APInt ZeroSplat(VT.getSizeInBits(), 0);
20227 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
20228 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
20229 << (Known.Zero.getBitWidth() * I);
20230
20231 DefBits = ~(DefBits | ZeroSplat);
20232 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20233 DefBits, &LHS)) ||
20234 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20235 DefBits, &LHS)))
20236 return NewOp;
20237
20238 UndefBits = ~(UndefBits | ZeroSplat);
20239 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20240 UndefBits, &LHS)) ||
20241 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20242 UndefBits, &LHS)))
20243 return NewOp;
20244 }
20245
20246 return SDValue();
20247}
20248
20251 SelectionDAG &DAG = DCI.DAG;
20252 SDValue LHS = N->getOperand(0);
20253 SDValue RHS = N->getOperand(1);
20254 EVT VT = N->getValueType(0);
20255 SDLoc DL(N);
20256
20257 if (!N->getFlags().hasAllowReassociation())
20258 return SDValue();
20259
20260  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
20261 auto ReassocComplex = [&](SDValue A, SDValue B) {
20262 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
20263 return SDValue();
20264 unsigned Opc = A.getConstantOperandVal(0);
20265 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
20266 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
20267 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
20268 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
20269 return SDValue();
20270 SDValue VCMLA = DAG.getNode(
20271 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
20272 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
20273 A.getOperand(2), A.getOperand(3));
20274 VCMLA->setFlags(A->getFlags());
20275 return VCMLA;
20276 };
20277 if (SDValue R = ReassocComplex(LHS, RHS))
20278 return R;
20279 if (SDValue R = ReassocComplex(RHS, LHS))
20280 return R;
20281
20282 return SDValue();
20283}
20284
20285static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
20286 switch (Opcode) {
20287 case ISD::STRICT_FADD:
20288 case ISD::FADD:
20289 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
20290 case ISD::ADD:
20291 return VT == MVT::i64;
20292 default:
20293 return false;
20294 }
20295}
20296
20297static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20298 AArch64CC::CondCode Cond);
20299
20300 static bool isPredicateCCSettingOp(SDValue N) {
20301 if ((N.getOpcode() == ISD::SETCC) ||
20302 // get_active_lane_mask is lowered to a whilelo instruction.
20303 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
20304 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
20305 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
20306 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege_x2 ||
20307 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
20308 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt_x2 ||
20309 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
20310 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi_x2 ||
20311 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
20312 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs_x2 ||
20313 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
20314 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele_x2 ||
20315 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
20316 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo_x2 ||
20317 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
20318 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels_x2 ||
20319 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
20320 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt_x2)))
20321 return true;
20322
20323 return false;
20324}
20325
20326// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
20327// ... into: "ptrue p, all" + PTEST
20328static SDValue
20329 performFirstTrueTestVectorCombine(SDNode *N,
20330 TargetLowering::DAGCombinerInfo &DCI,
20331 const AArch64Subtarget *Subtarget) {
20332 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20333 // Make sure PTEST can be legalised with illegal types.
20334 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20335 return SDValue();
20336
20337 SDValue N0 = N->getOperand(0);
20338 EVT VT = N0.getValueType();
20339
20340 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
20341 !isNullConstant(N->getOperand(1)))
20342 return SDValue();
20343
20344 // Restrict the DAG combine to cases where we're extracting from a
20345 // flag-setting operation.
20346 if (!isPredicateCCSettingOp(N0) || N0.getResNo() != 0)
20347 return SDValue();
20348
20349 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
20350 SelectionDAG &DAG = DCI.DAG;
20351 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
20352 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
20353}
20354
20355// Materialize : Idx = (add (mul vscale, NumEls), -1)
20356// i1 = extract_vector_elt t37, Constant:i64<Idx>
20357// ... into: "ptrue p, all" + PTEST
20358static SDValue
20359 performLastTrueTestVectorCombine(SDNode *N,
20360 TargetLowering::DAGCombinerInfo &DCI,
20361 const AArch64Subtarget *Subtarget) {
20362 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20363 // Make sure PTEST is used with legal types.
20364 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20365 return SDValue();
20366
20367 SDValue N0 = N->getOperand(0);
20368 EVT OpVT = N0.getValueType();
20369
20370 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
20371 return SDValue();
20372
20373 // Idx == (add (mul vscale, NumEls), -1)
20374 SDValue Idx = N->getOperand(1);
20375 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
20376 return SDValue();
20377
20378 SDValue VS = Idx.getOperand(0);
20379 if (VS.getOpcode() != ISD::VSCALE)
20380 return SDValue();
20381
20382 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
20383 if (VS.getConstantOperandVal(0) != NumEls)
20384 return SDValue();
20385
20386 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
20387 SelectionDAG &DAG = DCI.DAG;
20388 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
20389 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
20390}
20391
20392static SDValue
20393 performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20394 const AArch64Subtarget *Subtarget) {
20395 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20396 SelectionDAG &DAG = DCI.DAG;
20397 SDValue Vec = N->getOperand(0);
20398 SDValue Idx = N->getOperand(1);
20399
20400 if (DCI.isBeforeLegalize() || Idx.getOpcode() != ISD::VECTOR_FIND_LAST_ACTIVE)
20401 return SDValue();
20402
20403 // Only legal for 8, 16, 32, and 64 bit element types.
20404 EVT EltVT = Vec.getValueType().getVectorElementType();
20405 if (!is_contained(ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
20406 MVT::bf16, MVT::f32, MVT::f64}),
20407 EltVT.getSimpleVT().SimpleTy))
20408 return SDValue();
20409
20410 SDValue Mask = Idx.getOperand(0);
20411 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20412 if (!TLI.isOperationLegal(ISD::VECTOR_FIND_LAST_ACTIVE, Mask.getValueType()))
20413 return SDValue();
20414
20415 return DAG.getNode(AArch64ISD::LASTB, SDLoc(N), N->getValueType(0), Mask,
20416 Vec);
20417}
20418
20419static SDValue
20420 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20421 const AArch64Subtarget *Subtarget) {
20422 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20423 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
20424 return Res;
20425 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
20426 return Res;
20427 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
20428 return Res;
20429
20430 SelectionDAG &DAG = DCI.DAG;
20431 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20432
20433 EVT VT = N->getValueType(0);
20434 const bool FullFP16 = Subtarget->hasFullFP16();
20435 bool IsStrict = N0->isStrictFPOpcode();
20436
20437 // extract(dup x) -> x
20438 if (N0.getOpcode() == AArch64ISD::DUP)
20439 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
20440 : N0.getOperand(0);
20441
20442 // Rewrite for pairwise fadd pattern
20443 // (f32 (extract_vector_elt
20444 // (fadd (vXf32 Other)
20445 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
20446 // ->
20447 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
20448 // (extract_vector_elt (vXf32 Other) 1))
20449 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
20450 // we can only do this when it's used only by the extract_vector_elt.
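// The resulting (fadd (extract_elt v, 0), (extract_elt v, 1)) then matches the
// scalar pairwise-add patterns, e.g. "faddp d0, v0.2d" for f64 (illustrative).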
20451 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
20452 (!IsStrict || N0.hasOneUse())) {
20453 SDLoc DL(N0);
20454 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
20455 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
20456
20457 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
20458 SDValue Other = N00;
20459
20460 // And handle the commutative case.
20461 if (!Shuffle) {
20462 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
20463 Other = N01;
20464 }
20465
20466 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
20467 Other == Shuffle->getOperand(0)) {
20468 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20469 DAG.getConstant(0, DL, MVT::i64));
20470 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20471 DAG.getConstant(1, DL, MVT::i64));
20472 if (!IsStrict)
20473 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
20474
20475 // For strict_fadd we need uses of the final extract_vector to be replaced
20476 // with the strict_fadd, but we also need uses of the chain output of the
20477 // original strict_fadd to use the chain output of the new strict_fadd as
20478 // otherwise it may not be deleted.
20479 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
20480 {VT, MVT::Other},
20481 {N0->getOperand(0), Extract1, Extract2});
20482 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
20483 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
20484 return SDValue(N, 0);
20485 }
20486 }
20487
20488 // Given an extract(load) or extract(extend(load)), produce a scalar load
20489 // instead to avoid the cross-register-bank copies.
20490 if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
20491 VT.isInteger() && isa<ConstantSDNode>(N1)) {
20492 SDValue LoadN0 = N0;
20493 // Look through sext/zext and extract_subvector / insert_subvector if
20494 // required.
20495 if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
20496 N0.getOpcode() == ISD::SIGN_EXTEND ||
20497 N0.getOpcode() == ISD::ANY_EXTEND) &&
20498 N0.getOperand(0).hasOneUse())
20499 LoadN0 = N0.getOperand(0);
20500 unsigned OffsetElts = 0;
20501 if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
20502 OffsetElts = LoadN0.getConstantOperandVal(1);
20503 LoadN0 = LoadN0.getOperand(0);
20504 }
20505 if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
20506 LoadN0.getOperand(0).isUndef() &&
20507 isNullConstant(LoadN0.getOperand(2)) &&
20508 LoadN0.getOperand(1).hasOneUse())
20509 LoadN0 = LoadN0.getOperand(1);
20510
20511 // Check all the uses are valid and can be scalarized. We check that all the
20512 // uses are extracts and those extracts are not re-inserted into an
20513 // operation best treated as a vector register.
20514 auto Load = dyn_cast<LoadSDNode>(LoadN0);
20515 if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
20516 Load->getMemoryVT().isByteSized() &&
20517 all_of(N0->uses(), [&](const SDUse &U) {
20518 return U.getResNo() != N0.getResNo() ||
20519 (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20520 !any_of(U.getUser()->uses(), [](const SDUse &U2) {
20521 return U2.getUser()->getOpcode() ==
20522 ISD::INSERT_VECTOR_ELT ||
20523 U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
20524 U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
20525 }));
20526 })) {
20527
20528 SDLoc DL(Load);
20529
20530 // Generate a new scalar load.
20531 unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
20532 Load->getValueType(0).getScalarSizeInBits() / 8;
20533 SDValue BasePtr = DAG.getObjectPtrOffset(
20534 DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
20535 ISD::LoadExtType ExtType =
20536 N0.getOpcode() == ISD::SIGN_EXTEND
20537 ? ISD::SEXTLOAD
20538 : (N0.getOpcode() == ISD::ZERO_EXTEND ? ISD::ZEXTLOAD
20539 : ISD::EXTLOAD);
20540 SDValue ScalarLoad =
20541 DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr,
20542 Load->getPointerInfo().getWithOffset(Offset),
20543 Load->getValueType(0).getScalarType(),
20544 commonAlignment(Load->getAlign(), Offset),
20545 Load->getMemOperand()->getFlags(), Load->getAAInfo());
20546 DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
20547 return ScalarLoad;
20548 }
20549 }
20550
20551 return SDValue();
20552}
20553
20554 static SDValue performConcatVectorsCombine(SDNode *N,
20555 TargetLowering::DAGCombinerInfo &DCI,
20556 SelectionDAG &DAG) {
20557 SDLoc DL(N);
20558 EVT VT = N->getValueType(0);
20559 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20560 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
20561
20562 if (VT.isScalableVector())
20563 return SDValue();
20564
20565 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20566 N1Opc == ISD::TRUNCATE) {
20567 SDValue N00 = N0->getOperand(0);
20568 SDValue N10 = N1->getOperand(0);
20569 EVT N00VT = N00.getValueType();
20570 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
20571
20572 // Optimize concat_vectors of truncated vectors, where the intermediate
20573 // type is illegal, to avoid said illegality, e.g.,
20574 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
20575 // (v2i16 (truncate (v2i64)))))
20576 // ->
20577 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
20578 // (v4i32 (bitcast (v2i64))),
20579 // <0, 2, 4, 6>)))
20580 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
20581 // on both input and result type, so we might generate worse code.
20582 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
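// Illustratively, the even-lane shuffle above corresponds to a UZP1 of the two
// bitcast inputs, so the sequence can be selected without ever materialising
// the illegal intermediate type.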
20583 if (N00VT == N10.getValueType() &&
20584 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
20585 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
20586 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
20587 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
20588 for (size_t i = 0; i < Mask.size(); ++i)
20589 Mask[i] = i * 2;
20590 return DAG.getNode(ISD::TRUNCATE, DL, VT,
20591 DAG.getVectorShuffle(
20592 MidVT, DL,
20593 DAG.getNode(ISD::BITCAST, DL, MidVT, N00),
20594 DAG.getNode(ISD::BITCAST, DL, MidVT, N10), Mask));
20595 }
20596
20597 // Optimize two large shifts and a combine into a single combine and shift
20598 // For AArch64 architectures, sequences like the following:
20599 //
20600 // ushr v0.4s, v0.4s, #20
20601 // ushr v1.4s, v1.4s, #20
20602 // uzp1 v0.8h, v0.8h, v1.8h
20603 //
20604 // Can be optimized to:
20605 //
20606 // uzp2 v0.8h, v0.8h, v1.8h
20607 // ushr v0.8h, v0.8h, #4
20608 //
20609 // This optimization reduces instruction count.
20610 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
20611 N00->getOperand(1) == N10->getOperand(1)) {
20612 SDValue N000 = N00->getOperand(0);
20613 SDValue N100 = N10->getOperand(0);
20614 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
20615 N101ConstVal = N10->getConstantOperandVal(1),
20616 NScalarSize = N->getValueType(0).getScalarSizeInBits();
20617
20618 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
20619 N000 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N000);
20620 N100 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N100);
20621 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, DL, VT, N000, N100);
20622 SDValue NewShiftConstant =
20623 DAG.getConstant(N001ConstVal - NScalarSize, DL, MVT::i32);
20624
20625 return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
20626 }
20627 }
20628 }
20629
20630 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
20631 N->getOperand(0).getValueType() == MVT::v2i16 ||
20632 N->getOperand(0).getValueType() == MVT::v2i8) {
20633 EVT SrcVT = N->getOperand(0).getValueType();
20634 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
20635 // loads to prevent having to go through the v4i8 load legalization that
20636 // needs to extend each element into a larger type.
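// For example (illustrative): (v8i8 concat(v4i8 load A, v4i8 load B)) becomes
// a bitcast of (v2f32 build_vector(f32 load A, f32 load B)), i.e. two plain
// 32-bit scalar loads instead of two extending v4i8 loads.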
20637 if (N->getNumOperands() % 2 == 0 &&
20638 all_of(N->op_values(), [SrcVT](SDValue V) {
20639 if (V.getValueType() != SrcVT)
20640 return false;
20641 if (V.isUndef())
20642 return true;
20643 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
20644 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
20645 LD->getExtensionType() == ISD::NON_EXTLOAD;
20646 })) {
20647 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20648 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
20649 SmallVector<SDValue> Ops;
20650
20651 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20652 SDValue V = N->getOperand(i);
20653 if (V.isUndef())
20654 Ops.push_back(DAG.getUNDEF(FVT));
20655 else {
20656 LoadSDNode *LD = cast<LoadSDNode>(V);
20657 SDValue NewLoad = DAG.getLoad(FVT, DL, LD->getChain(),
20658 LD->getBasePtr(), LD->getMemOperand());
20659 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
20660 Ops.push_back(NewLoad);
20661 }
20662 }
20663 return DAG.getBitcast(N->getValueType(0),
20664 DAG.getBuildVector(NVT, DL, Ops));
20665 }
20666 }
20667
20668 // Canonicalise concat_vectors to replace concatenations of truncated nots
20669 // with nots of concatenated truncates. This in some cases allows for multiple
20670 // redundant negations to be eliminated.
20671 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20672 // (v4i16 (truncate (not (v4i32)))))
20673 // ->
20674 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20675 // (v4i16 (truncate (v4i32)))))
20676 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20677 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
20678 N->isOnlyUserOf(N1.getNode())) {
20679 auto isBitwiseVectorNegate = [](SDValue V) {
20680 return V->getOpcode() == ISD::XOR &&
20681 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
20682 };
20683 SDValue N00 = N0->getOperand(0);
20684 SDValue N10 = N1->getOperand(0);
20685 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
20686 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
20687 return DAG.getNOT(
20688 DL,
20689 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
20690 DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(),
20691 N00->getOperand(0)),
20692 DAG.getNode(ISD::TRUNCATE, DL, N1.getValueType(),
20693 N10->getOperand(0))),
20694 VT);
20695 }
20696 }
20697
20698 // Wait till after everything is legalized to try this. That way we have
20699 // legal vector types and such.
20700 if (DCI.isBeforeLegalizeOps())
20701 return SDValue();
20702
20703 // Optimise concat_vectors of two identical binops with a 128-bit destination
20704 // size, combining into a binop of two concats of the source vectors, e.g.:
20705 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20706 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20707 (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
20708 isVectorizedBinOp(N0Opc)) &&
20709 N0->hasOneUse() && N1->hasOneUse()) {
20710 SDValue N00 = N0->getOperand(0);
20711 SDValue N01 = N0->getOperand(1);
20712 SDValue N10 = N1->getOperand(0);
20713 SDValue N11 = N1->getOperand(1);
20714
20715 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20716 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N00, N10);
20717 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N01, N11);
20718 return DAG.getNode(N0Opc, DL, VT, Concat0, Concat1);
20719 }
20720 }
20721
20722 auto IsRSHRN = [](SDValue Shr) {
20723 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20724 return false;
20725 SDValue Op = Shr.getOperand(0);
20726 EVT VT = Op.getValueType();
20727 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20728 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20729 return false;
20730
20731 APInt Imm;
20732 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20733 Imm = APInt(VT.getScalarSizeInBits(),
20734 Op.getOperand(1).getConstantOperandVal(0)
20735 << Op.getOperand(1).getConstantOperandVal(1));
20736 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20737 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20738 Imm = APInt(VT.getScalarSizeInBits(),
20739 Op.getOperand(1).getConstantOperandVal(0));
20740 else
20741 return false;
20742
20743 if (Imm != 1ULL << (ShtAmt - 1))
20744 return false;
20745 return true;
20746 };
20747
20748 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
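// Here "rshrn" refers to the rounding-shift pattern matched by IsRSHRN above,
// i.e. (vlshr (add x, 1 << (shift - 1)), shift); doing the add and shift on the
// concatenated vector needs only one rounding constant and one shift.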
20749 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20750 ((IsRSHRN(N1) &&
20751 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
20752 N1.isUndef())) {
20753 SDValue X = N0.getOperand(0).getOperand(0);
20754 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20755 : N1.getOperand(0).getOperand(0);
20756 EVT BVT =
20757 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20758 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, DL, BVT, X, Y);
20759 SDValue Add = DAG.getNode(
20760 ISD::ADD, DL, BVT, CC,
20761 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), DL, BVT));
20762 SDValue Shr =
20763 DAG.getNode(AArch64ISD::VLSHR, DL, BVT, Add, N0.getOperand(1));
20764 return Shr;
20765 }
20766
20767 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20768 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20769 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20770 N0.getOperand(1) == N1.getOperand(1)) {
20771 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
20772 DAG.getUNDEF(N0.getValueType()));
20773 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(1),
20774 DAG.getUNDEF(N0.getValueType()));
20775 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, E0, E1);
20776 }
20777
20778 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20779 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20780 // canonicalise to that.
20781 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20782 assert(VT.getScalarSizeInBits() == 64);
20783 return DAG.getNode(AArch64ISD::DUPLANE64, DL, VT, WidenVector(N0, DAG),
20784 DAG.getConstant(0, DL, MVT::i64));
20785 }
20786
20787 // Canonicalise concat_vectors so that the right-hand vector has as few
20788 // bit-casts as possible before its real operation. The primary matching
20789 // destination for these operations will be the narrowing "2" instructions,
20790 // which depend on the operation being performed on this right-hand vector.
20791 // For example,
20792 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20793 // becomes
20794 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20795
20796 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20797 return SDValue();
20798 SDValue RHS = N1->getOperand(0);
20799 MVT RHSTy = RHS.getValueType().getSimpleVT();
20800 // If the RHS is not a vector, this is not the pattern we're looking for.
20801 if (!RHSTy.isVector())
20802 return SDValue();
20803
20804 LLVM_DEBUG(
20805 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20806
20807 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
20808 RHSTy.getVectorNumElements() * 2);
20809 return DAG.getNode(ISD::BITCAST, DL, VT,
20810 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatTy,
20811 DAG.getNode(ISD::BITCAST, DL, RHSTy, N0),
20812 RHS));
20813}
20814
20815static SDValue
20816 performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20817 SelectionDAG &DAG) {
20818 if (DCI.isBeforeLegalizeOps())
20819 return SDValue();
20820
20821 EVT VT = N->getValueType(0);
20822 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20823 return SDValue();
20824
20825 SDValue V = N->getOperand(0);
20826
20827 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20828 // blocks this combine because the non-const case requires custom lowering.
20829 //
20830 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20831 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20832 if (isa<ConstantSDNode>(V.getOperand(0)))
20833 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
20834
20835 return SDValue();
20836}
20837
20838static SDValue
20839 performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20840 SelectionDAG &DAG) {
20841 SDLoc DL(N);
20842 SDValue Vec = N->getOperand(0);
20843 SDValue SubVec = N->getOperand(1);
20844 uint64_t IdxVal = N->getConstantOperandVal(2);
20845 EVT VecVT = Vec.getValueType();
20846 EVT SubVT = SubVec.getValueType();
20847
20848 // Promote fixed length vector zeros.
20849 if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
20850 Vec.isUndef() && isZerosVector(SubVec.getNode()))
20851 return VecVT.isInteger() ? DAG.getConstant(0, DL, VecVT)
20852 : DAG.getConstantFP(0, DL, VecVT);
20853
20854 // Only do this for legal fixed vector types.
20855 if (!VecVT.isFixedLengthVector() ||
20856 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
20857 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
20858 return SDValue();
20859
20860 // Ignore widening patterns.
20861 if (IdxVal == 0 && Vec.isUndef())
20862 return SDValue();
20863
20864 // Subvector must be half the width and an "aligned" insertion.
20865 unsigned NumSubElts = SubVT.getVectorNumElements();
20866 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20867 (IdxVal != 0 && IdxVal != NumSubElts))
20868 return SDValue();
20869
20870 // Fold insert_subvector -> concat_vectors
20871 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20872 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20873 SDValue Lo, Hi;
20874 if (IdxVal == 0) {
20875 Lo = SubVec;
20876 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20877 DAG.getVectorIdxConstant(NumSubElts, DL));
20878 } else {
20879 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
20880 DAG.getVectorIdxConstant(0, DL));
20881 Hi = SubVec;
20882 }
20883 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
20884}
20885
20886 static SDValue tryCombineFixedPointConvert(SDNode *N,
20887 TargetLowering::DAGCombinerInfo &DCI,
20888 SelectionDAG &DAG) {
20889 // Wait until after everything is legalized to try this. That way we have
20890 // legal vector types and such.
20891 if (DCI.isBeforeLegalizeOps())
20892 return SDValue();
20893 // Transform a scalar conversion of a value from a lane extract into a
20894 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20895 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20896 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20897 //
20898 // The second form interacts better with instruction selection and the
20899 // register allocator to avoid cross-class register copies that aren't
20900 // coalescable due to a lane reference.
20901
20902 // Check the operand and see if it originates from a lane extract.
20903 SDValue Op1 = N->getOperand(1);
20904 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20905 return SDValue();
20906
20907 // Yep, no additional predication needed. Perform the transform.
20908 SDValue IID = N->getOperand(0);
20909 SDValue Shift = N->getOperand(2);
20910 SDValue Vec = Op1.getOperand(0);
20911 SDValue Lane = Op1.getOperand(1);
20912 EVT ResTy = N->getValueType(0);
20913 EVT VecResTy;
20914 SDLoc DL(N);
20915
20916 // The vector width should be 128 bits by the time we get here, even
20917 // if it started as 64 bits (the extract_vector handling will have
20918 // done so). Bail if it is not.
20919 if (Vec.getValueSizeInBits() != 128)
20920 return SDValue();
20921
20922 if (Vec.getValueType() == MVT::v4i32)
20923 VecResTy = MVT::v4f32;
20924 else if (Vec.getValueType() == MVT::v2i64)
20925 VecResTy = MVT::v2f64;
20926 else
20927 return SDValue();
20928
20929 SDValue Convert =
20930 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
20931 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
20932}
20933
20934// AArch64 high-vector "long" operations are formed by performing the non-high
20935// version on an extract_subvector of each operand which gets the high half:
20936//
20937// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
20938//
20939// However, there are cases which don't have an extract_high explicitly, but
20940// have another operation that can be made compatible with one for free. For
20941// example:
20942//
20943// (dupv64 scalar) --> (extract_high (dup128 scalar))
20944//
20945// This routine does the actual conversion of such DUPs, once outer routines
20946// have determined that everything else is in order.
20947// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
20948// similarly here.
20949 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
20950 MVT VT = N.getSimpleValueType();
20951 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
20952 N.getConstantOperandVal(1) == 0)
20953 N = N.getOperand(0);
20954
20955 switch (N.getOpcode()) {
20956 case AArch64ISD::DUP:
20957 case AArch64ISD::DUPLANE8:
20958 case AArch64ISD::DUPLANE16:
20959 case AArch64ISD::DUPLANE32:
20960 case AArch64ISD::DUPLANE64:
20961 case AArch64ISD::MOVI:
20962 case AArch64ISD::MOVIshift:
20963 case AArch64ISD::MOVIedit:
20964 case AArch64ISD::MOVImsl:
20965 case AArch64ISD::MVNIshift:
20966 case AArch64ISD::MVNImsl:
20967 break;
20968 default:
20969 // FMOV could be supported, but isn't very useful, as it would only occur
20970 // if you passed a bitcasted floating point immediate to an eligible long
20971 // integer op (addl, smull, ...).
20972 return SDValue();
20973 }
20974
20975 if (!VT.is64BitVector())
20976 return SDValue();
20977
20978 SDLoc DL(N);
20979 unsigned NumElems = VT.getVectorNumElements();
20980 if (N.getValueType().is64BitVector()) {
20981 MVT ElementTy = VT.getVectorElementType();
20982 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
20983 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
20984 }
20985
20986 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
20987 DAG.getConstant(NumElems, DL, MVT::i64));
20988}
20989
20990 static bool isEssentiallyExtractHighSubvector(SDValue N) {
20991 if (N.getOpcode() == ISD::BITCAST)
20992 N = N.getOperand(0);
20993 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20994 return false;
20995 if (N.getOperand(0).getValueType().isScalableVector())
20996 return false;
20997 return N.getConstantOperandAPInt(1) ==
20998 N.getOperand(0).getValueType().getVectorNumElements() / 2;
20999}
21000
21001/// Helper structure to keep track of ISD::SET_CC operands.
21002 struct GenericSetCCInfo {
21003 const SDValue *Opnd0;
21004 const SDValue *Opnd1;
21005 ISD::CondCode CC;
21006 };
21007
21008/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
21009 struct AArch64SetCCInfo {
21010 const SDValue *Cmp;
21011 AArch64CC::CondCode CC;
21012 };
21013
21014/// Helper structure to keep track of SetCC information.
21015 union SetCCInfo {
21016 GenericSetCCInfo Generic;
21017 AArch64SetCCInfo AArch64;
21018 };
21019
21020 /// Helper structure to be able to read SetCC information. If the IsAArch64
21021 /// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
21022 /// GenericSetCCInfo.
21023 struct SetCCInfoAndKind {
21024 SetCCInfo Info;
21025 bool IsAArch64;
21026 };
21027
21028/// Check whether or not \p Op is a SET_CC operation, either a generic or
21029/// an
21030/// AArch64 lowered one.
21031/// \p SetCCInfo is filled accordingly.
21032 /// \post SetCCInfo is meaningful only when this function returns true.
21033/// \return True when Op is a kind of SET_CC operation.
21035 // If this is a setcc, this is straight forward.
21036 if (Op.getOpcode() == ISD::SETCC) {
21037 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
21038 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
21039 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
21040 SetCCInfo.IsAArch64 = false;
21041 return true;
21042 }
21043 // Otherwise, check if this is a matching csel instruction.
21044 // In other words:
21045 // - csel 1, 0, cc
21046 // - csel 0, 1, !cc
21047 if (Op.getOpcode() != AArch64ISD::CSEL)
21048 return false;
21049 // Set the information about the operands.
21050 // TODO: we want the operands of the Cmp not the csel
21051 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
21052 SetCCInfo.IsAArch64 = true;
21053 SetCCInfo.Info.AArch64.CC =
21054 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21055
21056 // Check that the operands matches the constraints:
21057 // (1) Both operands must be constants.
21058 // (2) One must be 1 and the other must be 0.
21059 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
21060 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
21061
21062 // Check (1).
21063 if (!TValue || !FValue)
21064 return false;
21065
21066 // Check (2).
21067 if (!TValue->isOne()) {
21068 // Update the comparison when we are interested in !cc.
21069 std::swap(TValue, FValue);
21070 SetCCInfo.Info.AArch64.CC =
21071 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
21072 }
21073 return TValue->isOne() && FValue->isZero();
21074}
21075
21076// Returns true if Op is setcc or zext of setcc.
21077 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
21078 if (isSetCC(Op, Info))
21079 return true;
21080 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
21081 isSetCC(Op->getOperand(0), Info));
21082}
21083
21084// The folding we want to perform is:
21085// (add x, [zext] (setcc cc ...) )
21086// -->
21087// (csel x, (add x, 1), !cc ...)
21088//
21089// The latter will get matched to a CSINC instruction.
21090 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
21091 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
21092 SDValue LHS = Op->getOperand(0);
21093 SDValue RHS = Op->getOperand(1);
21094 SetCCInfoAndKind InfoAndKind;
21095
21096 // If both operands are a SET_CC, then we don't want to perform this
21097 // folding and create another csel as this results in more instructions
21098 // (and higher register usage).
21099 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
21100 isSetCCOrZExtSetCC(RHS, InfoAndKind))
21101 return SDValue();
21102
21103 // If neither operand is a SET_CC, give up.
21104 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
21105 std::swap(LHS, RHS);
21106 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
21107 return SDValue();
21108 }
21109
21110 // FIXME: This could be generalized to work for FP comparisons.
21111 EVT CmpVT = InfoAndKind.IsAArch64
21112 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
21113 : InfoAndKind.Info.Generic.Opnd0->getValueType();
21114 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
21115 return SDValue();
21116
21117 SDValue CCVal;
21118 SDValue Cmp;
21119 SDLoc DL(Op);
21120 if (InfoAndKind.IsAArch64) {
21121 CCVal = DAG.getConstant(
21122 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), DL,
21123 MVT::i32);
21124 Cmp = *InfoAndKind.Info.AArch64.Cmp;
21125 } else
21126 Cmp = getAArch64Cmp(
21127 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
21128 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
21129 DL);
21130
21131 EVT VT = Op->getValueType(0);
21132 LHS = DAG.getNode(ISD::ADD, DL, VT, RHS, DAG.getConstant(1, DL, VT));
21133 return DAG.getNode(AArch64ISD::CSEL, DL, VT, RHS, LHS, CCVal, Cmp);
21134}
21135
21136// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
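// That is, the sum across a plus the sum across b equals the sum across
// (a + b), so the two cross-lane reductions can be merged into one after a
// single vector add.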
21137 static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
21138 EVT VT = N->getValueType(0);
21139 // Only scalar integer and vector types.
21140 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
21141 return SDValue();
21142
21143 SDValue LHS = N->getOperand(0);
21144 SDValue RHS = N->getOperand(1);
21145 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21146 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
21147 return SDValue();
21148
21149 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
21150 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
21151 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
21152 return SDValue();
21153
21154 SDValue Op1 = LHS->getOperand(0);
21155 SDValue Op2 = RHS->getOperand(0);
21156 EVT OpVT1 = Op1.getValueType();
21157 EVT OpVT2 = Op2.getValueType();
21158 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
21159 Op2.getOpcode() != AArch64ISD::UADDV ||
21160 OpVT1.getVectorElementType() != VT)
21161 return SDValue();
21162
21163 SDValue Val1 = Op1.getOperand(0);
21164 SDValue Val2 = Op2.getOperand(0);
21165 EVT ValVT = Val1->getValueType(0);
21166 SDLoc DL(N);
21167 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
21168 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
21169 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
21170 DAG.getConstant(0, DL, MVT::i64));
21171}
21172
21173/// Perform the scalar expression combine in the form of:
21174/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
21175/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
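/// For example, b + (cc ? c : 1) is rewritten as cc ? (b + c) : (b + 1); the
/// false arm (b + 1) is exactly what CSINC provides, so no separate add of the
/// constant one is needed.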
21176 static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
21177 EVT VT = N->getValueType(0);
21178 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
21179 return SDValue();
21180
21181 SDValue LHS = N->getOperand(0);
21182 SDValue RHS = N->getOperand(1);
21183
21184 // Handle commutativity.
21185 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21186 LHS.getOpcode() != AArch64ISD::CSNEG) {
21187 std::swap(LHS, RHS);
21188 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21189 LHS.getOpcode() != AArch64ISD::CSNEG) {
21190 return SDValue();
21191 }
21192 }
21193
21194 if (!LHS.hasOneUse())
21195 return SDValue();
21196
21197 AArch64CC::CondCode AArch64CC =
21198 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
21199
21200 // The CSEL should include a constant one operand, and the CSNEG should
21201 // include a one or negative-one operand.
21202 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
21203 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
21204 if (!CTVal || !CFVal)
21205 return SDValue();
21206
21207 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
21208 (CTVal->isOne() || CFVal->isOne())) &&
21209 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
21210 (CTVal->isOne() || CFVal->isAllOnes())))
21211 return SDValue();
21212
21213 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
21214 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
21215 !CFVal->isOne()) {
21216 std::swap(CTVal, CFVal);
21217 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
21218 }
21219
21220 SDLoc DL(N);
21221 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
21222 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
21223 !CFVal->isAllOnes()) {
21224 APInt C = -1 * CFVal->getAPIntValue();
21225 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
21226 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
21227 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
21228 }
21229
21230 // It might be neutral for larger constants, as the immediate needs to be
21231 // materialized in a register.
21232 APInt ADDC = CTVal->getAPIntValue();
21233 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21234 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
21235 return SDValue();
21236
21237 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
21238 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
21239 "Unexpected constant value");
21240
21241 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
21242 SDValue CCVal = getCondCode(DAG, AArch64CC);
21243 SDValue Cmp = LHS.getOperand(3);
21244
21245 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
21246}
21247
21248// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
21249 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
21250 EVT VT = N->getValueType(0);
21251 if (N->getOpcode() != ISD::ADD)
21252 return SDValue();
21253
21254 SDValue Dot = N->getOperand(0);
21255 SDValue A = N->getOperand(1);
21256 // Handle commutativity.
21257 auto isZeroDot = [](SDValue Dot) {
21258 return (Dot.getOpcode() == AArch64ISD::UDOT ||
21259 Dot.getOpcode() == AArch64ISD::SDOT) &&
21260 isZerosVector(Dot.getOperand(0).getNode());
21261 };
21262 if (!isZeroDot(Dot))
21263 std::swap(Dot, A);
21264 if (!isZeroDot(Dot))
21265 return SDValue();
21266
21267 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
21268 Dot.getOperand(2));
21269}
21270
21271 static bool isNegatedInteger(SDValue Op) {
21272 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
21273}
21274
21275// Try to fold
21276//
21277// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
21278//
21279// The folding helps csel to be matched with csneg without generating
21280// redundant neg instruction, which includes negation of the csel expansion
21281// of abs node lowered by lowerABS.
21282 static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
21283 if (!isNegatedInteger(SDValue(N, 0)))
21284 return SDValue();
21285
21286 SDValue CSel = N->getOperand(1);
21287 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
21288 return SDValue();
21289
21290 SDValue N0 = CSel.getOperand(0);
21291 SDValue N1 = CSel.getOperand(1);
21292
21293 // If neither of them are negations, it's not worth the folding as it
21294 // introduces two additional negations while reducing one negation.
21295 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
21296 return SDValue();
21297
21298 SDLoc DL(N);
21299 EVT VT = CSel.getValueType();
21300
21301 SDValue N0N = DAG.getNegative(N0, DL, VT);
21302 SDValue N1N = DAG.getNegative(N1, DL, VT);
21303
21304 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
21305 CSel.getOperand(3));
21306}
21307
21308// The basic add/sub long vector instructions have variants with "2" on the end
21309// which act on the high-half of their inputs. They are normally matched by
21310// patterns like:
21311//
21312// (add (zeroext (extract_high LHS)),
21313// (zeroext (extract_high RHS)))
21314// -> uaddl2 vD, vN, vM
21315//
21316// However, if one of the extracts is something like a duplicate, this
21317// instruction can still be used profitably. This function puts the DAG into a
21318// more appropriate form for those patterns to trigger.
21319 static SDValue performAddSubLongCombine(SDNode *N,
21320 TargetLowering::DAGCombinerInfo &DCI) {
21321 SelectionDAG &DAG = DCI.DAG;
21322 if (DCI.isBeforeLegalizeOps())
21323 return SDValue();
21324
21325 MVT VT = N->getSimpleValueType(0);
21326 if (!VT.is128BitVector()) {
21327 if (N->getOpcode() == ISD::ADD)
21328 return performSetccAddFolding(N, DAG);
21329 return SDValue();
21330 }
21331
21332 // Make sure both branches are extended in the same way.
21333 SDValue LHS = N->getOperand(0);
21334 SDValue RHS = N->getOperand(1);
21335 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
21336 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
21337 LHS.getOpcode() != RHS.getOpcode())
21338 return SDValue();
21339
21340 unsigned ExtType = LHS.getOpcode();
21341
21342 // It's not worth doing if at least one of the inputs isn't already an
21343 // extract, but we don't know which it'll be so we have to try both.
21344 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
21345 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
21346 if (!RHS.getNode())
21347 return SDValue();
21348
21349 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
21350 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
21351 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
21352 if (!LHS.getNode())
21353 return SDValue();
21354
21355 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
21356 }
21357
21358 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
21359}
21360
21361static bool isCMP(SDValue Op) {
21362 return Op.getOpcode() == AArch64ISD::SUBS &&
21363 !Op.getNode()->hasAnyUseOfValue(0);
21364}
21365
21366// (CSEL 1 0 CC Cond) => CC
21367// (CSEL 0 1 CC Cond) => !CC
21368static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
21369 if (Op.getOpcode() != AArch64ISD::CSEL)
21370 return std::nullopt;
21371 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21372 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
21373 return std::nullopt;
21374 SDValue OpLHS = Op.getOperand(0);
21375 SDValue OpRHS = Op.getOperand(1);
21376 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
21377 return CC;
21378 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
21379 return getInvertedCondCode(CC);
21380
21381 return std::nullopt;
21382}
21383
21384// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
21385// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
21386static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
21387 SDValue CmpOp = Op->getOperand(2);
21388 if (!isCMP(CmpOp))
21389 return SDValue();
21390
21391 if (IsAdd) {
21392 if (!isOneConstant(CmpOp.getOperand(1)))
21393 return SDValue();
21394 } else {
21395 if (!isNullConstant(CmpOp.getOperand(0)))
21396 return SDValue();
21397 }
21398
21399 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
21400 auto CC = getCSETCondCode(CsetOp);
21401 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
21402 return SDValue();
21403
21404 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
21405 Op->getOperand(0), Op->getOperand(1),
21406 CsetOp.getOperand(3));
21407}
21408
21409// (ADC x 0 cond) => (CINC x HS cond)
21411 SDValue LHS = N->getOperand(0);
21412 SDValue RHS = N->getOperand(1);
21413 SDValue Cond = N->getOperand(2);
21414
21415 if (!isNullConstant(RHS))
21416 return SDValue();
21417
21418 EVT VT = N->getValueType(0);
21419 SDLoc DL(N);
21420
21421 // (CINC x cc cond) <=> (CSINC x x !cc cond)
21422 SDValue CC = getCondCode(DAG, AArch64CC::LO);
21423 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
21424}
21425
21426 static SDValue performBuildVectorCombine(SDNode *N,
21427 TargetLowering::DAGCombinerInfo &DCI,
21428 SelectionDAG &DAG) {
21429 SDLoc DL(N);
21430 EVT VT = N->getValueType(0);
21431
21433 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
21434 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
21435 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
21436 if (Elt0->getOpcode() == ISD::FP_ROUND &&
21437 Elt1->getOpcode() == ISD::FP_ROUND &&
21438 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21439 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21440 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
21441 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21442 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21443 // Constant index.
21444 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
21445 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21446 Elt0->getOperand(0)->getOperand(0) ==
21447 Elt1->getOperand(0)->getOperand(0) &&
21448 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
21449 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
21450 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
21451 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
21452 SDValue HighLanes;
21453 if (Elt2->getOpcode() == ISD::UNDEF &&
21454 Elt3->getOpcode() == ISD::UNDEF) {
21455 HighLanes = DAG.getUNDEF(MVT::v2f32);
21456 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
21457 Elt3->getOpcode() == ISD::FP_ROUND &&
21458 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
21459 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
21460 Elt2->getConstantOperandVal(1) ==
21461 Elt3->getConstantOperandVal(1) &&
21462 Elt2->getOperand(0)->getOpcode() ==
21463 ISD::EXTRACT_VECTOR_ELT &&
21464 Elt3->getOperand(0)->getOpcode() ==
21465 ISD::EXTRACT_VECTOR_ELT &&
21466 // Constant index.
21467 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
21468 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
21469 Elt2->getOperand(0)->getOperand(0) ==
21470 Elt3->getOperand(0)->getOperand(0) &&
21471 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
21472 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
21473 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
21474 HighLanes =
21475 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
21476 }
21477 if (HighLanes) {
21478 SDValue DoubleToSingleSticky =
21479 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
21480 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
21481 DoubleToSingleSticky, HighLanes);
21482 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
21483 Elt0->getOperand(1));
21484 }
21485 }
21486 }
21487 }
21488
21489 if (VT == MVT::v2f64) {
21490 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21491 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
21492 Elt1->getOpcode() == ISD::FP_EXTEND &&
21493 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21494 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21495 Elt0->getOperand(0)->getOperand(0) ==
21496 Elt1->getOperand(0)->getOperand(0) &&
21497 // Constant index.
21498 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
21499 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
21500 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
21501 Elt1->getOperand(0)->getConstantOperandVal(1) &&
21502 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21503 // ResultType's known minimum vector length.
21504 Elt0->getOperand(0)->getConstantOperandVal(1) %
21506 0) {
21507 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
21508 if (SrcVec.getValueType() == MVT::v4f16 ||
21509 SrcVec.getValueType() == MVT::v4bf16) {
21510 SDValue HalfToSingle =
21511 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
21512 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
21513 SDValue Extract = DAG.getNode(
21515 HalfToSingle, SubvectorIdx);
21516 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
21517 }
21518 }
21519 }
21520
21521 // A build vector of two extracted elements is equivalent to an
21522 // extract subvector where the inner vector is any-extended to the
21523 // extract_vector_elt VT.
21524 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
21525 // (extract_elt_iXX_to_i32 vec Idx+1))
21526 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
21527
21528 // For now, only consider the v2i32 case, which arises as a result of
21529 // legalization.
21530 if (VT != MVT::v2i32)
21531 return SDValue();
21532
21533 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
21534 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
21535 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21536 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21537 // Constant index.
21538 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
21539 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
21540 // Both EXTRACT_VECTOR_ELT from same vector...
21541 Elt0->getOperand(0) == Elt1->getOperand(0) &&
21542 // ... and contiguous. First element's index +1 == second element's index.
21543 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
21544 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
21545 // ResultType's known minimum vector length.
21546 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
21547 SDValue VecToExtend = Elt0->getOperand(0);
21548 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
21549 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
21550 return SDValue();
21551
21552 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
21553
21554 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
21555 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
21556 SubvectorIdx);
21557 }
21558
21559 return SDValue();
21560}
21561
21562// A special combine for the sqdmulh family of instructions.
21563 // smin( sra( mul( sext v0, sext v1 ), SHIFT_AMOUNT ),
21564 // SATURATING_VAL ) can be reduced to sqdmulh(...)
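// Illustratively, for i16 elements: sqdmulh(a, b) == sat((2 * a * b) >> 16)
// == sat((a * b) >> 15), and smin(sra(mul(sext a, sext b), 15), 0x7FFF)
// computes the same value, the smin providing the positive saturation (only
// a == b == -32768 can overflow upwards).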
21565 static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
21566
21567 if (N->getOpcode() != ISD::SMIN)
21568 return SDValue();
21569
21570 EVT DestVT = N->getValueType(0);
21571
21572 if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 ||
21573 DestVT.isScalableVector())
21574 return SDValue();
21575
21576 ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
21577
21578 if (!Clamp)
21579 return SDValue();
21580
21581 MVT ScalarType;
21582 unsigned ShiftAmt = 0;
21583 switch (Clamp->getSExtValue()) {
21584 case (1ULL << 15) - 1:
21585 ScalarType = MVT::i16;
21586 ShiftAmt = 16;
21587 break;
21588 case (1ULL << 31) - 1:
21589 ScalarType = MVT::i32;
21590 ShiftAmt = 32;
21591 break;
21592 default:
21593 return SDValue();
21594 }
21595
21596 SDValue Sra = N->getOperand(0);
21597 if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
21598 return SDValue();
21599
21600 ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
21601 if (!RightShiftVec)
21602 return SDValue();
21603 unsigned SExtValue = RightShiftVec->getSExtValue();
21604
21605 if (SExtValue != (ShiftAmt - 1))
21606 return SDValue();
21607
21608 SDValue Mul = Sra.getOperand(0);
21609 if (Mul.getOpcode() != ISD::MUL)
21610 return SDValue();
21611
21612 SDValue SExt0 = Mul.getOperand(0);
21613 SDValue SExt1 = Mul.getOperand(1);
21614
21615 if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
21616 SExt1.getOpcode() != ISD::SIGN_EXTEND)
21617 return SDValue();
21618
21619 EVT SExt0Type = SExt0.getOperand(0).getValueType();
21620 EVT SExt1Type = SExt1.getOperand(0).getValueType();
21621
21622 if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType ||
21623 SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() ||
21624 SExt0Type.getVectorNumElements() == 1)
21625 return SDValue();
21626
21627 SDLoc DL(N);
21628 SDValue V0 = SExt0.getOperand(0);
21629 SDValue V1 = SExt1.getOperand(0);
21630
21631 // Ensure input vectors are extended to legal types
21632 if (SExt0Type.getFixedSizeInBits() < 64) {
21633 unsigned VecNumElements = SExt0Type.getVectorNumElements();
21634 EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements),
21635 VecNumElements);
21636 V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0);
21637 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1);
21638 }
21639
21640 SDValue SQDMULH =
21641 DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
21642
21643 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
21644}
21645
21646 static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG) {
21647 if (SDValue V = trySQDMULHCombine(N, DAG)) {
21648 return V;
21649 }
21650
21651 return SDValue();
21652}
21653
21656 SDLoc DL(N);
21657 EVT VT = N->getValueType(0);
21658 SDValue N0 = N->getOperand(0);
21659 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
21660 N0.getOpcode() == AArch64ISD::DUP) {
21661 SDValue Op = N0.getOperand(0);
21662 if (VT.getScalarType() == MVT::i32 &&
21663 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
21664 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
21665 return DAG.getNode(N0.getOpcode(), DL, VT, Op);
21666 }
21667
21668 // Performing the following combine produces a preferable form for ISEL.
21669 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2)
21671 N0.hasOneUse()) {
21672 SDValue Op = N0.getOperand(0);
21673 SDValue ExtractIndexNode = N0.getOperand(1);
21674 if (!isa<ConstantSDNode>(ExtractIndexNode))
21675 return SDValue();
21676
21677 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
21678 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
21679 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
21680 "Unexpected legalisation result!");
21681
21682 EVT SrcVectorType = Op.getValueType();
21683 // We also assume that SrcVectorType cannot be a V64 (see
21684 // LowerEXTRACT_VECTOR_ELT).
21685 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
21686 "Unexpected legalisation result!");
21687
21688 unsigned ExtractIndex =
21689 cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
21690 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
21691
21692 Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
21693 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
21694 DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
21695 }
21696
21697 return SDValue();
21698}
21699
21700 // Check whether a node is an extend or shift operand.
21701 static bool isExtendOrShiftOperand(SDValue N) {
21702 unsigned Opcode = N.getOpcode();
21703 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
21704 EVT SrcVT;
21705 if (Opcode == ISD::SIGN_EXTEND_INREG)
21706 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
21707 else
21708 SrcVT = N.getOperand(0).getValueType();
21709
21710 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
21711 } else if (Opcode == ISD::AND) {
21712 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
21713 if (!CSD)
21714 return false;
21715 uint64_t AndMask = CSD->getZExtValue();
21716 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
21717 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
21718 return isa<ConstantSDNode>(N.getOperand(1));
21719 }
21720
21721 return false;
21722}
21723
21724// (N - Y) + Z --> (Z - Y) + N
21725// when N is an extend or shift operand
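// Rewriting this way lets N end up as the right-hand operand of the final ADD,
// where it can be folded as an extended- or shifted-register operand, e.g.
// "add x0, x8, w1, sxtb" (illustrative).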
21726 static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
21727 SelectionDAG &DAG) {
21728 auto IsOneUseExtend = [](SDValue N) {
21729 return N.hasOneUse() && isExtendOrShiftOperand(N);
21730 };
21731
21732 // DAGCombiner will revert the combination when Z is constant, causing an
21733 // infinite loop, so don't enable the combination when Z is constant.
21734 // If Z is a one-use shift, we also can't do the optimization, as it would
21735 // fall into the same infinite loop.
21736 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
21737 return SDValue();
21738
21739 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
21740 return SDValue();
21741
21742 SDValue Shift = SUB.getOperand(0);
21743 if (!IsOneUseExtend(Shift))
21744 return SDValue();
21745
21746 SDLoc DL(N);
21747 EVT VT = N->getValueType(0);
21748
21749 SDValue Y = SUB.getOperand(1);
21750 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
21751 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
21752}
21753
21754 static SDValue performAddCombineForShiftedOperands(SDNode *N,
21755 SelectionDAG &DAG) {
21756 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
21757 // commutative.
21758 if (N->getOpcode() != ISD::ADD)
21759 return SDValue();
21760
21761 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
21762 // shifted register is only available for i32 and i64.
21763 EVT VT = N->getValueType(0);
21764 if (VT != MVT::i32 && VT != MVT::i64)
21765 return SDValue();
21766
21767 SDLoc DL(N);
21768 SDValue LHS = N->getOperand(0);
21769 SDValue RHS = N->getOperand(1);
21770
21771 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
21772 return Val;
21773 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
21774 return Val;
21775
21776 uint64_t LHSImm = 0, RHSImm = 0;
21777 // If both operands are shifted by an immediate and the shift amount is not
21778 // greater than 4 for one of them, swap LHS and RHS to put the operand with
21779 // the smaller shift amount on the RHS.
21780 //
21781 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
21782 // an LSL shift of 4 or less has smaller latency and higher throughput than
21783 // ADD with a larger LSL shift. For other processors this is a no-op for
21784 // performance and does not affect correctness.
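// For example (illustrative), (add (shl x, 1), (shl y, 7)) is rewritten as
// (add (shl y, 7), (shl x, 1)) so that the cheap "LSL #1" is the one folded
// into the ADD as its shifted-register operand.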
21785 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
21786 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
21787 RHSImm > 4 && LHS.hasOneUse())
21788 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
21789
21790 return SDValue();
21791}
21792
21793 // The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
21794 // This reassociates it back to allow the creation of more mls instructions.
21795 static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
21796 if (N->getOpcode() != ISD::SUB)
21797 return SDValue();
21798
21799 SDValue Add = N->getOperand(1);
21800 SDValue X = N->getOperand(0);
21801 if (Add.getOpcode() != ISD::ADD)
21802 return SDValue();
21803
21804 if (!Add.hasOneUse())
21805 return SDValue();
21807 return SDValue();
21808
21809 SDValue M1 = Add.getOperand(0);
21810 SDValue M2 = Add.getOperand(1);
21811 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
21812 M1.getOpcode() != AArch64ISD::UMULL)
21813 return SDValue();
21814 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
21815 M2.getOpcode() != AArch64ISD::UMULL)
21816 return SDValue();
21817
21818 EVT VT = N->getValueType(0);
21819 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
21820 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
21821}
21822
21823// Combine into mla/mls.
21824// This works on the patterns of:
21825// add v1, (mul v2, v3)
21826// sub v1, (mul v2, v3)
21827// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
21828// It will transform the add/sub to a scalable version, so that we can
21829// make use of SVE's MLA/MLS that will be generated for that pattern
21830static SDValue
21832 SelectionDAG &DAG = DCI.DAG;
21833 // Make sure that the types are legal
21834 if (!DCI.isAfterLegalizeDAG())
21835 return SDValue();
21836 // Before using SVE's features, check first if it's available.
21837 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
21838 return SDValue();
21839
21840 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
21841 return SDValue();
21842
21843 if (!N->getValueType(0).isFixedLengthVector())
21844 return SDValue();
21845
21846 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
21847 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21848 return SDValue();
21849
21850 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
21851 return SDValue();
21852
21853 SDValue MulValue = Op1->getOperand(0);
21854 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
21855 return SDValue();
21856
21857 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
21858 return SDValue();
21859
21860 EVT ScalableVT = MulValue.getValueType();
21861 if (!ScalableVT.isScalableVector())
21862 return SDValue();
21863
21864 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
21865 SDValue NewValue =
21866 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
21867 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
21868 };
21869
21870 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
21871 return res;
21872 else if (N->getOpcode() == ISD::ADD)
21873 return performOpt(N->getOperand(1), N->getOperand(0));
21874
21875 return SDValue();
21876}
21877
21878// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
21879// help, for example, to produce ssra from sshr+add.
21881 EVT VT = N->getValueType(0);
21882 if (VT != MVT::i64 ||
21883 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
21884 return SDValue();
21885 SDValue Op0 = N->getOperand(0);
21886 SDValue Op1 = N->getOperand(1);
21887
21888 // At least one of the operands should be an extract, and the other should be
21889 // something that is easy to convert to v1i64 type (in this case a load).
21890 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21891 Op0.getOpcode() != ISD::LOAD)
21892 return SDValue();
21893 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21894 Op1.getOpcode() != ISD::LOAD)
21895 return SDValue();
21896
21897 SDLoc DL(N);
21898 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21899 Op0.getOperand(0).getValueType() == MVT::v1i64) {
21900 Op0 = Op0.getOperand(0);
21901 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
21902 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21903 Op1.getOperand(0).getValueType() == MVT::v1i64) {
21904 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
21905 Op1 = Op1.getOperand(0);
21906 } else
21907 return SDValue();
21908
21909 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
21910 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
21911 DAG.getConstant(0, DL, MVT::i64));
21912}
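
// Editor's note (illustrative addition, not part of this file): the combine
// above is sound because a lane-wise add/sub on single-lane <1 x i64> vectors
// is exactly the scalar i64 operation on lane 0, so extracting lane 0 of the
// vector result gives back the original scalar value. Sketch with hypothetical
// names:
#include <cassert>
#include <cstdint>
static void checkV1I64AddMatchesScalar(uint64_t A, uint64_t B) {
  uint64_t VecA[1] = {A}, VecB[1] = {B}, VecSum[1];
  for (int Lane = 0; Lane < 1; ++Lane) // the "vector" add, one lane wide
    VecSum[Lane] = VecA[Lane] + VecB[Lane];
  assert(VecSum[0] == A + B); // extract of lane 0 equals the scalar add
}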
21913
21916 if (!BV->hasOneUse())
21917 return false;
21918 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
21919 if (!Ld || !Ld->isSimple())
21920 return false;
21921 Loads.push_back(Ld);
21922 return true;
21923 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
21925 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
21926 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
21927 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
21928 return false;
21929 Loads.push_back(Ld);
21930 }
21931 return true;
21932 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
21933 // Try to find a tree of shuffles and concats from how IR shuffles of loads
21934 // are lowered. Note that this only comes up because we do not always visit
21935 // operands before uses. After that is fixed this can be removed; in the
21936 // meantime, this is fairly specific to the lowering we expect from IR.
21937 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
21938 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
21939 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
21940 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
21941 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
21942 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
21943 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
21944 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
21945 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
21946 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
21947 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
21948 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21949 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
21950 B.getOperand(1).getNumOperands() != 4)
21951 return false;
21952 auto SV1 = cast<ShuffleVectorSDNode>(B);
21953 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
21954 int NumElts = B.getValueType().getVectorNumElements();
21955 int NumSubElts = NumElts / 4;
21956 for (int I = 0; I < NumSubElts; I++) {
21957 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
21958 if (SV1->getMaskElt(I) != I ||
21959 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21960 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
21961 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
21962 return false;
21963 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
21964 if (SV2->getMaskElt(I) != I ||
21965 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
21966 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
21967 return false;
21968 }
21969 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
21970 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
21971 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
21972 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
21973 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
21974 !Ld2->isSimple() || !Ld3->isSimple())
21975 return false;
21976 Loads.push_back(Ld0);
21977 Loads.push_back(Ld1);
21978 Loads.push_back(Ld2);
21979 Loads.push_back(Ld3);
21980 return true;
21981 }
21982 return false;
21983}
21984
21986 SelectionDAG &DAG,
21987 unsigned &NumSubLoads) {
21988 if (!Op0.hasOneUse() || !Op1.hasOneUse())
21989 return false;
21990
21991 SmallVector<LoadSDNode *> Loads0, Loads1;
21992 if (isLoadOrMultipleLoads(Op0, Loads0) &&
21993 isLoadOrMultipleLoads(Op1, Loads1)) {
21994 if (NumSubLoads && Loads0.size() != NumSubLoads)
21995 return false;
21996 NumSubLoads = Loads0.size();
21997 return Loads0.size() == Loads1.size() &&
21998 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
21999 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
22000 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
22002 Size / 8, 1);
22003 });
22004 }
22005
22006 if (Op0.getOpcode() != Op1.getOpcode())
22007 return false;
22008
22009 switch (Op0.getOpcode()) {
22010 case ISD::ADD:
22011 case ISD::SUB:
22013 DAG, NumSubLoads) &&
22015 DAG, NumSubLoads);
22016 case ISD::SIGN_EXTEND:
22017 case ISD::ANY_EXTEND:
22018 case ISD::ZERO_EXTEND:
22019 EVT XVT = Op0.getOperand(0).getValueType();
22020 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
22021 XVT.getScalarSizeInBits() != 32)
22022 return false;
22024 DAG, NumSubLoads);
22025 }
22026 return false;
22027}
22028
22029// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
22030// into a single load of twice the size, from which we extract the bottom and
22031// top parts so that the shl can use a shll2 instruction. The two loads in that
22032// example can also be larger trees of instructions, which are identical except
22033// for the leaves, which are all loads offset from the LHS, including
22034// buildvectors of multiple loads. For example, the RHS tree could be
22035// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
22036// Whilst it can be common for the larger loads to replace LDP instructions
22037// (which doesn't gain anything on its own), the larger loads can help create
22038// more efficient code, and in buildvectors prevent the need for ld1 lane
22039// inserts, which can be slower than normal loads.
22041 EVT VT = N->getValueType(0);
22042 if (!VT.isFixedLengthVector() ||
22043 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
22044 VT.getScalarSizeInBits() != 64))
22045 return SDValue();
22046
22047 SDValue Other = N->getOperand(0);
22048 SDValue Shift = N->getOperand(1);
22049 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
22050 std::swap(Shift, Other);
22051 APInt ShiftAmt;
22052 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
22053 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
22054 return SDValue();
22055
22056 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
22057 !ISD::isExtOpcode(Other.getOpcode()) ||
22058 Shift.getOperand(0).getOperand(0).getValueType() !=
22059 Other.getOperand(0).getValueType() ||
22060 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
22061 return SDValue();
22062
22063 SDValue Op0 = Other.getOperand(0);
22064 SDValue Op1 = Shift.getOperand(0).getOperand(0);
22065
22066 unsigned NumSubLoads = 0;
22067 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
22068 return SDValue();
22069
22070 // Attempt to rule out some unprofitable cases using heuristics (some working
22071 // around suboptimal code generation), notably if the extend would not be able
22072 // to use ushll2 instructions because the types are not large enough. Otherwise
22073 // zips will need to be created, which can increase the instruction count.
22074 unsigned NumElts = Op0.getValueType().getVectorNumElements();
22075 unsigned NumSubElts = NumElts / NumSubLoads;
22076 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
22077 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
22078 Op0.getValueType().getSizeInBits() < 128 &&
22080 return SDValue();
22081
22082 // Recreate the tree with the new combined loads.
22083 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
22084 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
22085 EVT DVT =
22087
22088 SmallVector<LoadSDNode *> Loads0, Loads1;
22089 if (isLoadOrMultipleLoads(Op0, Loads0) &&
22090 isLoadOrMultipleLoads(Op1, Loads1)) {
22091 EVT LoadVT = EVT::getVectorVT(
22092 *DAG.getContext(), Op0.getValueType().getScalarType(),
22093 Op0.getValueType().getVectorNumElements() / Loads0.size());
22094 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
22095
22096 SmallVector<SDValue> NewLoads;
22097 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
22098 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
22099 L0->getBasePtr(), L0->getPointerInfo(),
22100 L0->getBaseAlign());
22101 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
22102 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
22103 NewLoads.push_back(Load);
22104 }
22105 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
22106 }
22107
22109 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
22110 Ops.push_back(GenCombinedTree(O0, O1, DAG));
22111 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
22112 };
22113 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
22114
22115 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
22116 int Hi = NumSubElts, Lo = 0;
22117 for (unsigned i = 0; i < NumSubLoads; i++) {
22118 for (unsigned j = 0; j < NumSubElts; j++) {
22119 LowMask[i * NumSubElts + j] = Lo++;
22120 HighMask[i * NumSubElts + j] = Hi++;
22121 }
22122 Lo += NumSubElts;
22123 Hi += NumSubElts;
22124 }
22125 SDLoc DL(N);
22126 SDValue Ext0, Ext1;
22127 // Extract the top and bottom lanes, then extend the result. Alternatively,
22128 // extend the result first and then extract the lanes if the two operands
22129 // match, as that produces slightly smaller code.
22130 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
22132 NewOp, DAG.getConstant(0, DL, MVT::i64));
22133 SDValue SubH =
22134 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
22135 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22136 SDValue Extr0 =
22137 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
22138 SDValue Extr1 =
22139 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
22140 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
22141 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
22142 } else {
22144 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
22145 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22146 DAG.getConstant(0, DL, MVT::i64));
22147 SDValue SubH =
22148 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22149 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22150 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
22151 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
22152 }
22153 SDValue NShift =
22154 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
22155 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
22156}
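
// Editor's note (illustrative addition, not part of this file): the LowMask /
// HighMask built above deinterleave the concatenated double-width loads back
// into their "low" and "high" halves. For NumSubLoads = 2 and NumSubElts = 4
// they come out as
//   LowMask  = { 0, 1, 2, 3,  8,  9, 10, 11 }
//   HighMask = { 4, 5, 6, 7, 12, 13, 14, 15 }
// A standalone re-computation of the same masks (hypothetical helper):
#include <cassert>
#include <vector>
static void checkCombinedLoadMasks() {
  const unsigned NumSubLoads = 2, NumSubElts = 4;
  std::vector<int> LowMask(NumSubLoads * NumSubElts), HighMask(LowMask.size());
  int Hi = NumSubElts, Lo = 0;
  for (unsigned I = 0; I < NumSubLoads; ++I) {
    for (unsigned J = 0; J < NumSubElts; ++J) {
      LowMask[I * NumSubElts + J] = Lo++;
      HighMask[I * NumSubElts + J] = Hi++;
    }
    Lo += NumSubElts;
    Hi += NumSubElts;
  }
  assert(LowMask[4] == 8 && HighMask[0] == 4 && HighMask[7] == 15);
}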
22157
22160 // Try to change sum of two reductions.
22161 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
22162 return Val;
22163 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
22164 return Val;
22165 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
22166 return Val;
22167 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
22168 return Val;
22169 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
22170 return Val;
22172 return Val;
22173 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
22174 return Val;
22175 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
22176 return Val;
22177 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
22178 return Val;
22179
22180 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
22181 return Val;
22182
22183 return performAddSubLongCombine(N, DCI);
22184}
22185
22186// Massage DAGs which we can use the high-half "long" operations on into
22187// something isel will recognize better. E.g.
22188//
22189// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
22190// (aarch64_neon_umull (extract_high (v2i64 vec))
22191// (extract_high (v2i64 (dup128 scalar))))
22192//
22195 SelectionDAG &DAG) {
22196 if (DCI.isBeforeLegalizeOps())
22197 return SDValue();
22198
22199 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
22200 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
22201 assert(LHS.getValueType().is64BitVector() &&
22202 RHS.getValueType().is64BitVector() &&
22203 "unexpected shape for long operation");
22204
22205 // Either node could be a DUP, but it's not worth doing both of them (you'd
22206 // just as well use the non-high version) so look for a corresponding extract
22207 // operation on the other "wing".
22210 if (!RHS.getNode())
22211 return SDValue();
22214 if (!LHS.getNode())
22215 return SDValue();
22216 } else
22217 return SDValue();
22218
22219 if (IID == Intrinsic::not_intrinsic)
22220 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
22221
22222 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
22223 N->getOperand(0), LHS, RHS);
22224}
22225
22226static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22227 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
22228 unsigned ElemBits = ElemTy.getSizeInBits();
22229
22230 int64_t ShiftAmount;
22231 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
22232 APInt SplatValue, SplatUndef;
22233 unsigned SplatBitSize;
22234 bool HasAnyUndefs;
22235 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
22236 HasAnyUndefs, ElemBits) ||
22237 SplatBitSize != ElemBits)
22238 return SDValue();
22239
22240 ShiftAmount = SplatValue.getSExtValue();
22241 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
22242 ShiftAmount = CVN->getSExtValue();
22243 } else
22244 return SDValue();
22245
22246 // If the shift amount is zero, remove the shift intrinsic.
22247 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
22248 return N->getOperand(1);
22249
22250 unsigned Opcode;
22251 bool IsRightShift;
22252 switch (IID) {
22253 default:
22254 llvm_unreachable("Unknown shift intrinsic");
22255 case Intrinsic::aarch64_neon_sqshl:
22256 Opcode = AArch64ISD::SQSHL_I;
22257 IsRightShift = false;
22258 break;
22259 case Intrinsic::aarch64_neon_uqshl:
22260 Opcode = AArch64ISD::UQSHL_I;
22261 IsRightShift = false;
22262 break;
22263 case Intrinsic::aarch64_neon_srshl:
22264 Opcode = AArch64ISD::SRSHR_I;
22265 IsRightShift = true;
22266 break;
22267 case Intrinsic::aarch64_neon_urshl:
22268 Opcode = AArch64ISD::URSHR_I;
22269 IsRightShift = true;
22270 break;
22271 case Intrinsic::aarch64_neon_sqshlu:
22272 Opcode = AArch64ISD::SQSHLU_I;
22273 IsRightShift = false;
22274 break;
22275 case Intrinsic::aarch64_neon_sshl:
22276 case Intrinsic::aarch64_neon_ushl:
22277 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
22278 // left shift for positive shift amounts. For negative shifts we can use a
22279 // VASHR/VLSHR as appropriate.
22280 if (ShiftAmount < 0) {
22281 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
22282 : AArch64ISD::VLSHR;
22283 ShiftAmount = -ShiftAmount;
22284 } else
22285 Opcode = AArch64ISD::VSHL;
22286 IsRightShift = false;
22287 break;
22288 }
22289
22290 EVT VT = N->getValueType(0);
22291 SDValue Op = N->getOperand(1);
22292 SDLoc DL(N);
22293 if (VT == MVT::i64) {
22294 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op);
22295 VT = MVT::v1i64;
22296 }
22297
22298 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
22299 Op = DAG.getNode(Opcode, DL, VT, Op,
22300 DAG.getSignedConstant(-ShiftAmount, DL, MVT::i32));
22301 if (N->getValueType(0) == MVT::i64)
22302 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22303 DAG.getConstant(0, DL, MVT::i64));
22304 return Op;
22305 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
22306 Op = DAG.getNode(Opcode, DL, VT, Op,
22307 DAG.getConstant(ShiftAmount, DL, MVT::i32));
22308 if (N->getValueType(0) == MVT::i64)
22309 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22310 DAG.getConstant(0, DL, MVT::i64));
22311 return Op;
22312 }
22313
22314 return SDValue();
22315}
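
// Editor's note (illustrative addition, not part of this file): for sshl/ushl
// with a constant splat amount, the code above folds a non-negative amount to
// an immediate left shift and a negative amount to an immediate right shift by
// the negated value. A scalar model of the unsigned (ushl) case, assuming the
// usual semantics where a negative amount means "shift right" (hypothetical
// helper names):
#include <cassert>
#include <cstdint>
static uint32_t ushlByConstant(uint32_t V, int64_t Amount) {
  if (Amount >= 0 && Amount < 32)
    return V << Amount;            // folds to an immediate VSHL
  if (Amount >= -32 && Amount < 0) // folds to an immediate VLSHR by -Amount
    return Amount == -32 ? 0 : V >> (-Amount);
  return V; // other amounts are not modelled in this sketch
}
static void checkUshlByConstant() {
  assert(ushlByConstant(0x80u, 4) == 0x800u);
  assert(ushlByConstant(0x80u, -4) == 0x8u);
}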
22316
22317// The CRC32[BH] instructions ignore the high bits of their data operand. Since
22318// the intrinsics must be legal and take an i32, this means there's almost
22319// certainly going to be a zext in the DAG which we can eliminate.
22320static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
22321 SDValue AndN = N->getOperand(2);
22322 if (AndN.getOpcode() != ISD::AND)
22323 return SDValue();
22324
22326 if (!CMask || CMask->getZExtValue() != Mask)
22327 return SDValue();
22328
22329 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
22330 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
22331}
22332
22334 SelectionDAG &DAG) {
22335 SDLoc DL(N);
22336 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
22337 DAG.getNode(Opc, DL, N->getOperand(1).getSimpleValueType(),
22338 N->getOperand(1)),
22339 DAG.getConstant(0, DL, MVT::i64));
22340}
22341
22343 SDLoc DL(N);
22344 SDValue Op1 = N->getOperand(1);
22345 SDValue Op2 = N->getOperand(2);
22346 EVT ScalarTy = Op2.getValueType();
22347 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22348 ScalarTy = MVT::i32;
22349
22350 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
22351 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
22352 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
22353 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
22354 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
22355 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
22356}
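
// Editor's note (illustrative addition, not part of this file): element i of
// index(base, step) is base + i * step, which is exactly what the
// mul(step, step_vector(1)) + splat(base) expansion above computes lane by
// lane. Sketch with hypothetical names:
#include <cassert>
#include <cstdint>
static void checkIndexVectorLowering(int64_t Base, int64_t Step) {
  for (int64_t I = 0; I < 8; ++I) {
    int64_t StepVecElem = I;                     // lane I of step_vector(1)
    int64_t Lowered = StepVecElem * Step + Base; // mul + add of the splats
    assert(Lowered == Base + I * Step);          // index(base, step) lane I
  }
}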
22357
22359 SDLoc DL(N);
22360 SDValue Scalar = N->getOperand(3);
22361 EVT ScalarTy = Scalar.getValueType();
22362
22363 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22364 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
22365
22366 SDValue Passthru = N->getOperand(1);
22367 SDValue Pred = N->getOperand(2);
22368 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, DL, N->getValueType(0),
22369 Pred, Scalar, Passthru);
22370}
22371
22373 SDLoc DL(N);
22374 LLVMContext &Ctx = *DAG.getContext();
22375 EVT VT = N->getValueType(0);
22376
22377 assert(VT.isScalableVector() && "Expected a scalable vector.");
22378
22379 // Current lowering only supports the SVE-ACLE types.
22381 return SDValue();
22382
22383 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
22384 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
22385 EVT ByteVT =
22386 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
22387
22388 // Convert everything to the domain of EXT (i.e. bytes).
22389 SDValue Op0 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(1));
22390 SDValue Op1 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(2));
22391 SDValue Op2 = DAG.getNode(ISD::MUL, DL, MVT::i32, N->getOperand(3),
22392 DAG.getConstant(ElemSize, DL, MVT::i32));
22393
22394 SDValue EXT = DAG.getNode(AArch64ISD::EXT, DL, ByteVT, Op0, Op1, Op2);
22395 return DAG.getNode(ISD::BITCAST, DL, VT, EXT);
22396}
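
// Editor's note (illustrative addition, not part of this file): EXT is defined
// on bytes, so the element-level "take elements idx.. from the concatenation
// of the two inputs" becomes a byte-level extract at idx * sizeof(element),
// which is why the index operand is multiplied by ElemSize above. Standalone
// sketch of that equivalence (hypothetical names; assumes the usual
// extract-from-concatenation reading of EXT):
#include <cassert>
#include <cstdint>
#include <cstring>
static void checkExtByteDomain() {
  uint32_t A[4] = {0, 1, 2, 3}, B[4] = {4, 5, 6, 7};
  const unsigned Idx = 3;                   // element index of the extract
  uint32_t ExpectedElems[4] = {3, 4, 5, 6}; // ext(A, B, Idx) element-wise
  uint8_t Bytes[32];                        // the "bitcast to bytes" step
  std::memcpy(Bytes, A, sizeof(A));
  std::memcpy(Bytes + sizeof(A), B, sizeof(B));
  uint32_t FromBytes[4];
  std::memcpy(FromBytes, Bytes + Idx * sizeof(uint32_t), sizeof(FromBytes));
  assert(std::memcmp(FromBytes, ExpectedElems, sizeof(FromBytes)) == 0);
}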
22397
22400 SelectionDAG &DAG) {
22401 if (DCI.isBeforeLegalize())
22402 return SDValue();
22403
22404 SDValue Comparator = N->getOperand(3);
22405 if (Comparator.getOpcode() == AArch64ISD::DUP ||
22406 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
22407 unsigned IID = getIntrinsicID(N);
22408 EVT VT = N->getValueType(0);
22409 EVT CmpVT = N->getOperand(2).getValueType();
22410 SDValue Pred = N->getOperand(1);
22411 SDValue Imm;
22412 SDLoc DL(N);
22413
22414 switch (IID) {
22415 default:
22416 llvm_unreachable("Called with wrong intrinsic!");
22417 break;
22418
22419 // Signed comparisons
22420 case Intrinsic::aarch64_sve_cmpeq_wide:
22421 case Intrinsic::aarch64_sve_cmpne_wide:
22422 case Intrinsic::aarch64_sve_cmpge_wide:
22423 case Intrinsic::aarch64_sve_cmpgt_wide:
22424 case Intrinsic::aarch64_sve_cmplt_wide:
22425 case Intrinsic::aarch64_sve_cmple_wide: {
22426 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22427 int64_t ImmVal = CN->getSExtValue();
22428 if (ImmVal >= -16 && ImmVal <= 15)
22429 Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
22430 else
22431 return SDValue();
22432 }
22433 break;
22434 }
22435 // Unsigned comparisons
22436 case Intrinsic::aarch64_sve_cmphs_wide:
22437 case Intrinsic::aarch64_sve_cmphi_wide:
22438 case Intrinsic::aarch64_sve_cmplo_wide:
22439 case Intrinsic::aarch64_sve_cmpls_wide: {
22440 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
22441 uint64_t ImmVal = CN->getZExtValue();
22442 if (ImmVal <= 127)
22443 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
22444 else
22445 return SDValue();
22446 }
22447 break;
22448 }
22449 }
22450
22451 if (!Imm)
22452 return SDValue();
22453
22454 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
22455 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
22456 N->getOperand(2), Splat, DAG.getCondCode(CC));
22457 }
22458
22459 return SDValue();
22460}
22461
22464 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22465
22466 SDLoc DL(Op);
22467 assert(Op.getValueType().isScalableVector() &&
22468 TLI.isTypeLegal(Op.getValueType()) &&
22469 "Expected legal scalable vector type!");
22470 assert(Op.getValueType() == Pg.getValueType() &&
22471 "Expected same type for PTEST operands");
22472
22473 // Ensure target specific opcodes are using legal type.
22474 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
22475 SDValue TVal = DAG.getConstant(1, DL, OutVT);
22476 SDValue FVal = DAG.getConstant(0, DL, OutVT);
22477
22478 // Ensure operands have type nxv16i1.
22479 if (Op.getValueType() != MVT::nxv16i1) {
22482 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
22483 else
22484 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
22485 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
22486 }
22487
22488 unsigned PTest = AArch64ISD::PTEST;
22490 PTest = AArch64ISD::PTEST_ANY;
22491 else if (Cond == AArch64CC::FIRST_ACTIVE)
22492 PTest = AArch64ISD::PTEST_FIRST;
22493
22494 // Set condition code (CC) flags.
22495 SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op);
22496
22497 // Convert CC to integer based on requested condition.
22498 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
22499 SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond));
22500 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
22501 return DAG.getZExtOrTrunc(Res, DL, VT);
22502}
22503
22505 SelectionDAG &DAG) {
22506 SDLoc DL(N);
22507
22508 SDValue Pred = N->getOperand(1);
22509 SDValue VecToReduce = N->getOperand(2);
22510
22511 // NOTE: The integer reduction's result type is not always linked to the
22512 // operand's element type so we construct it from the intrinsic's result type.
22513 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
22514 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22515
22516 // SVE reductions set the whole vector register with the first element
22517 // containing the reduction result, which we'll now extract.
22518 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22519 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22520 Zero);
22521}
22522
22524 SelectionDAG &DAG) {
22525 SDLoc DL(N);
22526
22527 SDValue Pred = N->getOperand(1);
22528 SDValue VecToReduce = N->getOperand(2);
22529
22530 EVT ReduceVT = VecToReduce.getValueType();
22531 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
22532
22533 // SVE reductions set the whole vector register with the first element
22534 // containing the reduction result, which we'll now extract.
22535 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22536 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22537 Zero);
22538}
22539
22541 SelectionDAG &DAG) {
22542 SDLoc DL(N);
22543
22544 SDValue Pred = N->getOperand(1);
22545 SDValue InitVal = N->getOperand(2);
22546 SDValue VecToReduce = N->getOperand(3);
22547 EVT ReduceVT = VecToReduce.getValueType();
22548
22549 // Ordered reductions use the first lane of the result vector as the
22550 // reduction's initial value.
22551 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22552 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
22553 DAG.getUNDEF(ReduceVT), InitVal, Zero);
22554
22555 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
22556
22557 // SVE reductions set the whole vector register with the first element
22558 // containing the reduction result, which we'll now extract.
22559 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
22560 Zero);
22561}
22562
22564 SelectionDAG &DAG) {
22565 if (N->getValueType(0) != MVT::i16)
22566 return SDValue();
22567
22568 SDLoc DL(N);
22569 SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
22570 SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
22571 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
22572}
22573
22574// If a merged operation has no inactive lanes we can relax it to a predicated
22575// or unpredicated operation, which potentially allows better isel (perhaps
22576// using immediate forms) or relaxing register reuse requirements.
22578 SelectionDAG &DAG, bool UnpredOp = false,
22579 bool SwapOperands = false) {
22580 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
22581 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
22582 SDValue Pg = N->getOperand(1);
22583 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
22584 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
22585
22586 // ISD way to specify an all active predicate.
22587 if (isAllActivePredicate(DAG, Pg)) {
22588 if (UnpredOp)
22589 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
22590
22591 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
22592 }
22593
22594 // FUTURE: SplatVector(true)
22595 return SDValue();
22596}
22597
22598static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22599 SDLoc DL(N);
22600 EVT VT = N->getValueType(0);
22601 SDValue Op1 = N->getOperand(1);
22602 SDValue Op2 = N->getOperand(2);
22603 SDValue Op3 = N->getOperand(3);
22604
22605 switch (IID) {
22606 default:
22607 llvm_unreachable("Called with wrong intrinsic!");
22608 case Intrinsic::aarch64_sve_bsl:
22609 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2);
22610 case Intrinsic::aarch64_sve_bsl1n:
22611 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, DAG.getNOT(DL, Op1, VT),
22612 Op2);
22613 case Intrinsic::aarch64_sve_bsl2n:
22614 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1,
22615 DAG.getNOT(DL, Op2, VT));
22616 case Intrinsic::aarch64_sve_nbsl:
22617 return DAG.getNOT(DL, DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2),
22618 VT);
22619 }
22620}
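
// Editor's note (illustrative addition, not part of this file): the four SVE
// bit-select intrinsics above all map onto one bitwise-select primitive,
// assumed here to behave as bsp(sel, a, b) = (a & sel) | (b & ~sel), with one
// input (or the result, for nbsl) inverted. Standalone sketch with
// hypothetical helper names:
#include <cassert>
#include <cstdint>
static uint64_t bsp(uint64_t Sel, uint64_t A, uint64_t B) {
  return (A & Sel) | (B & ~Sel);
}
static void checkSveBitSelLowering(uint64_t A, uint64_t B, uint64_t Sel) {
  uint64_t Bsl = bsp(Sel, A, B);    // aarch64_sve_bsl
  uint64_t Bsl1n = bsp(Sel, ~A, B); // aarch64_sve_bsl1n (first input inverted)
  uint64_t Bsl2n = bsp(Sel, A, ~B); // aarch64_sve_bsl2n (second input inverted)
  uint64_t Nbsl = ~Bsl;             // aarch64_sve_nbsl  (result inverted)
  assert(bsp(~0ULL, A, B) == A && bsp(0ULL, A, B) == B); // select semantics
  assert(Nbsl == ~bsp(Sel, A, B));
  (void)Bsl1n;
  (void)Bsl2n;
}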
22621
22624 const AArch64Subtarget *Subtarget) {
22625 SelectionDAG &DAG = DCI.DAG;
22626 unsigned IID = getIntrinsicID(N);
22627 switch (IID) {
22628 default:
22629 break;
22630 case Intrinsic::aarch64_neon_vcvtfxs2fp:
22631 case Intrinsic::aarch64_neon_vcvtfxu2fp:
22632 return tryCombineFixedPointConvert(N, DCI, DAG);
22633 case Intrinsic::aarch64_neon_saddv:
22634 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
22635 case Intrinsic::aarch64_neon_uaddv:
22636 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
22637 case Intrinsic::aarch64_neon_sminv:
22638 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
22639 case Intrinsic::aarch64_neon_uminv:
22640 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
22641 case Intrinsic::aarch64_neon_smaxv:
22642 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
22643 case Intrinsic::aarch64_neon_umaxv:
22644 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
22645 case Intrinsic::aarch64_neon_fmax:
22646 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
22647 N->getOperand(1), N->getOperand(2));
22648 case Intrinsic::aarch64_neon_fmin:
22649 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
22650 N->getOperand(1), N->getOperand(2));
22651 case Intrinsic::aarch64_neon_fmaxnm:
22652 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
22653 N->getOperand(1), N->getOperand(2));
22654 case Intrinsic::aarch64_neon_fminnm:
22655 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
22656 N->getOperand(1), N->getOperand(2));
22657 case Intrinsic::aarch64_neon_smull:
22658 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
22659 N->getOperand(1), N->getOperand(2));
22660 case Intrinsic::aarch64_neon_umull:
22661 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
22662 N->getOperand(1), N->getOperand(2));
22663 case Intrinsic::aarch64_neon_pmull:
22664 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
22665 N->getOperand(1), N->getOperand(2));
22666 case Intrinsic::aarch64_neon_sqdmull:
22667 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
22668 case Intrinsic::aarch64_neon_sqshl:
22669 case Intrinsic::aarch64_neon_uqshl:
22670 case Intrinsic::aarch64_neon_sqshlu:
22671 case Intrinsic::aarch64_neon_srshl:
22672 case Intrinsic::aarch64_neon_urshl:
22673 case Intrinsic::aarch64_neon_sshl:
22674 case Intrinsic::aarch64_neon_ushl:
22675 return tryCombineShiftImm(IID, N, DAG);
22676 case Intrinsic::aarch64_neon_sabd:
22677 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22678 N->getOperand(1), N->getOperand(2));
22679 case Intrinsic::aarch64_neon_uabd:
22680 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22681 N->getOperand(1), N->getOperand(2));
22682 case Intrinsic::aarch64_neon_fcvtzs:
22683 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
22684 case Intrinsic::aarch64_neon_fcvtzu:
22685 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
22686 case Intrinsic::aarch64_neon_fcvtas:
22687 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
22688 case Intrinsic::aarch64_neon_fcvtau:
22689 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
22690 case Intrinsic::aarch64_neon_fcvtms:
22691 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
22692 case Intrinsic::aarch64_neon_fcvtmu:
22693 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
22694 case Intrinsic::aarch64_neon_fcvtns:
22695 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
22696 case Intrinsic::aarch64_neon_fcvtnu:
22697 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
22698 case Intrinsic::aarch64_neon_fcvtps:
22699 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
22700 case Intrinsic::aarch64_neon_fcvtpu:
22701 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
22702 case Intrinsic::aarch64_crc32b:
22703 case Intrinsic::aarch64_crc32cb:
22704 return tryCombineCRC32(0xff, N, DAG);
22705 case Intrinsic::aarch64_crc32h:
22706 case Intrinsic::aarch64_crc32ch:
22707 return tryCombineCRC32(0xffff, N, DAG);
22708 case Intrinsic::aarch64_sve_saddv:
22709 // There is no i64 version of SADDV because the sign is irrelevant.
22710 if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64)
22711 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22712 else
22713 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
22714 case Intrinsic::aarch64_sve_uaddv:
22715 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
22716 case Intrinsic::aarch64_sve_smaxv:
22717 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
22718 case Intrinsic::aarch64_sve_umaxv:
22719 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
22720 case Intrinsic::aarch64_sve_sminv:
22721 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
22722 case Intrinsic::aarch64_sve_uminv:
22723 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
22724 case Intrinsic::aarch64_sve_orv:
22725 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
22726 case Intrinsic::aarch64_sve_eorv:
22727 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
22728 case Intrinsic::aarch64_sve_andv:
22729 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
22730 case Intrinsic::aarch64_sve_index:
22731 return LowerSVEIntrinsicIndex(N, DAG);
22732 case Intrinsic::aarch64_sve_dup:
22733 return LowerSVEIntrinsicDUP(N, DAG);
22734 case Intrinsic::aarch64_sve_dup_x:
22735 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
22736 N->getOperand(1));
22737 case Intrinsic::aarch64_sve_ext:
22738 return LowerSVEIntrinsicEXT(N, DAG);
22739 case Intrinsic::aarch64_sve_mul_u:
22740 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
22741 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22742 case Intrinsic::aarch64_sve_smulh_u:
22743 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
22744 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22745 case Intrinsic::aarch64_sve_umulh_u:
22746 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
22747 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22748 case Intrinsic::aarch64_sve_smin_u:
22749 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
22750 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22751 case Intrinsic::aarch64_sve_umin_u:
22752 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
22753 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22754 case Intrinsic::aarch64_sve_smax_u:
22755 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
22756 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22757 case Intrinsic::aarch64_sve_umax_u:
22758 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
22759 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22760 case Intrinsic::aarch64_sve_lsl_u:
22761 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
22762 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22763 case Intrinsic::aarch64_sve_lsr_u:
22764 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
22765 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22766 case Intrinsic::aarch64_sve_asr_u:
22767 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
22768 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22769 case Intrinsic::aarch64_sve_fadd_u:
22770 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
22771 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22772 case Intrinsic::aarch64_sve_fdiv_u:
22773 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
22774 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22775 case Intrinsic::aarch64_sve_fmax_u:
22776 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
22777 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22778 case Intrinsic::aarch64_sve_fmaxnm_u:
22779 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
22780 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22781 case Intrinsic::aarch64_sve_fmla_u:
22782 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
22783 N->getOperand(1), N->getOperand(3), N->getOperand(4),
22784 N->getOperand(2));
22785 case Intrinsic::aarch64_sve_fmin_u:
22786 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
22787 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22788 case Intrinsic::aarch64_sve_fminnm_u:
22789 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
22790 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22791 case Intrinsic::aarch64_sve_fmul_u:
22792 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
22793 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22794 case Intrinsic::aarch64_sve_fsub_u:
22795 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
22796 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22797 case Intrinsic::aarch64_sve_add_u:
22798 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
22799 N->getOperand(3));
22800 case Intrinsic::aarch64_sve_sub_u:
22801 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
22802 N->getOperand(3));
22803 case Intrinsic::aarch64_sve_subr:
22804 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
22805 case Intrinsic::aarch64_sve_and_u:
22806 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
22807 N->getOperand(3));
22808 case Intrinsic::aarch64_sve_bic_u:
22809 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
22810 N->getOperand(2), N->getOperand(3));
22811 case Intrinsic::aarch64_sve_saddwb:
22812 return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
22813 N->getOperand(1), N->getOperand(2));
22814 case Intrinsic::aarch64_sve_saddwt:
22815 return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
22816 N->getOperand(1), N->getOperand(2));
22817 case Intrinsic::aarch64_sve_uaddwb:
22818 return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
22819 N->getOperand(1), N->getOperand(2));
22820 case Intrinsic::aarch64_sve_uaddwt:
22821 return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
22822 N->getOperand(1), N->getOperand(2));
22823 case Intrinsic::aarch64_sve_eor_u:
22824 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22825 N->getOperand(3));
22826 case Intrinsic::aarch64_sve_orr_u:
22827 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
22828 N->getOperand(3));
22829 case Intrinsic::aarch64_sve_sabd_u:
22830 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
22831 N->getOperand(2), N->getOperand(3));
22832 case Intrinsic::aarch64_sve_uabd_u:
22833 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
22834 N->getOperand(2), N->getOperand(3));
22835 case Intrinsic::aarch64_sve_sdiv_u:
22836 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
22837 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22838 case Intrinsic::aarch64_sve_udiv_u:
22839 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
22840 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22841 case Intrinsic::aarch64_sve_sqadd:
22842 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
22843 case Intrinsic::aarch64_sve_sqsub_u:
22844 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22845 N->getOperand(2), N->getOperand(3));
22846 case Intrinsic::aarch64_sve_uqadd:
22847 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
22848 case Intrinsic::aarch64_sve_uqsub_u:
22849 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22850 N->getOperand(2), N->getOperand(3));
22851 case Intrinsic::aarch64_sve_sqadd_x:
22852 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
22853 N->getOperand(1), N->getOperand(2));
22854 case Intrinsic::aarch64_sve_sqsub_x:
22855 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
22856 N->getOperand(1), N->getOperand(2));
22857 case Intrinsic::aarch64_sve_uqadd_x:
22858 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
22859 N->getOperand(1), N->getOperand(2));
22860 case Intrinsic::aarch64_sve_uqsub_x:
22861 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
22862 N->getOperand(1), N->getOperand(2));
22863 case Intrinsic::aarch64_sve_asrd:
22864 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
22865 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22866 case Intrinsic::aarch64_sve_cmphs:
22867 if (!N->getOperand(2).getValueType().isFloatingPoint())
22868 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22869 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22870 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
22871 break;
22872 case Intrinsic::aarch64_sve_cmphi:
22873 if (!N->getOperand(2).getValueType().isFloatingPoint())
22874 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22875 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22876 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
22877 break;
22878 case Intrinsic::aarch64_sve_fcmpge:
22879 case Intrinsic::aarch64_sve_cmpge:
22880 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22881 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22882 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
22883 break;
22884 case Intrinsic::aarch64_sve_fcmpgt:
22885 case Intrinsic::aarch64_sve_cmpgt:
22886 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22887 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22888 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
22889 break;
22890 case Intrinsic::aarch64_sve_fcmpeq:
22891 case Intrinsic::aarch64_sve_cmpeq:
22892 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22893 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22894 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
22895 break;
22896 case Intrinsic::aarch64_sve_fcmpne:
22897 case Intrinsic::aarch64_sve_cmpne:
22898 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22899 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22900 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
22901 break;
22902 case Intrinsic::aarch64_sve_fcmpuo:
22903 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
22904 N->getValueType(0), N->getOperand(1), N->getOperand(2),
22905 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
22906 break;
22907 case Intrinsic::aarch64_sve_fadda:
22908 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
22909 case Intrinsic::aarch64_sve_faddv:
22910 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
22911 case Intrinsic::aarch64_sve_fmaxnmv:
22912 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
22913 case Intrinsic::aarch64_sve_fmaxv:
22914 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
22915 case Intrinsic::aarch64_sve_fminnmv:
22916 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
22917 case Intrinsic::aarch64_sve_fminv:
22918 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
22919 case Intrinsic::aarch64_sve_sel:
22920 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
22921 N->getOperand(1), N->getOperand(2), N->getOperand(3));
22922 case Intrinsic::aarch64_sve_cmpeq_wide:
22923 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
22924 case Intrinsic::aarch64_sve_cmpne_wide:
22925 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
22926 case Intrinsic::aarch64_sve_cmpge_wide:
22927 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
22928 case Intrinsic::aarch64_sve_cmpgt_wide:
22929 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
22930 case Intrinsic::aarch64_sve_cmplt_wide:
22931 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
22932 case Intrinsic::aarch64_sve_cmple_wide:
22933 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
22934 case Intrinsic::aarch64_sve_cmphs_wide:
22935 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
22936 case Intrinsic::aarch64_sve_cmphi_wide:
22937 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
22938 case Intrinsic::aarch64_sve_cmplo_wide:
22939 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
22940 case Intrinsic::aarch64_sve_cmpls_wide:
22941 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
22942 case Intrinsic::aarch64_sve_ptest_any:
22943 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22945 case Intrinsic::aarch64_sve_ptest_first:
22946 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22948 case Intrinsic::aarch64_sve_ptest_last:
22949 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
22951 case Intrinsic::aarch64_sve_whilelo:
22952 return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, SDLoc(N), N->getValueType(0),
22953 N->getOperand(1), N->getOperand(2));
22954 case Intrinsic::aarch64_sve_bsl:
22955 case Intrinsic::aarch64_sve_bsl1n:
22956 case Intrinsic::aarch64_sve_bsl2n:
22957 case Intrinsic::aarch64_sve_nbsl:
22958 return combineSVEBitSel(IID, N, DAG);
22959 }
22960 return SDValue();
22961}
22962
22963static bool isCheapToExtend(const SDValue &N) {
22964 unsigned OC = N->getOpcode();
22965 return OC == ISD::LOAD || OC == ISD::MLOAD ||
22967}
22968
22969static SDValue
22971 SelectionDAG &DAG) {
22972 // If we have (sext (setcc A B)) and A and B are cheap to extend,
22973 // we can move the sext into the arguments and have the same result. For
22974 // example, if A and B are both loads, we can make those extending loads and
22975 // avoid an extra instruction. This pattern appears often in VLS code
22976 // generation where the inputs to the setcc have a different size to the
22977 // instruction that wants to use the result of the setcc.
22978 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
22979 N->getOperand(0)->getOpcode() == ISD::SETCC);
22980 const SDValue SetCC = N->getOperand(0);
22981
22982 const SDValue CCOp0 = SetCC.getOperand(0);
22983 const SDValue CCOp1 = SetCC.getOperand(1);
22984 if (!CCOp0->getValueType(0).isInteger() ||
22985 !CCOp1->getValueType(0).isInteger())
22986 return SDValue();
22987
22988 ISD::CondCode Code =
22989 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
22990
22991 ISD::NodeType ExtType =
22992 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22993
22994 if (isCheapToExtend(SetCC.getOperand(0)) &&
22995 isCheapToExtend(SetCC.getOperand(1))) {
22996 const SDValue Ext1 =
22997 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
22998 const SDValue Ext2 =
22999 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
23000
23001 return DAG.getSetCC(
23002 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
23003 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
23004 }
23005
23006 return SDValue();
23007}
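
// Editor's note (illustrative addition, not part of this file): pushing the
// extension onto the setcc operands is sound because extending both operands
// with the signedness of the comparison does not change its outcome. Minimal
// scalar check (hypothetical names):
#include <cassert>
#include <cstdint>
static void checkExtendedCompareMatches(int8_t A, int8_t B, uint8_t UA,
                                        uint8_t UB) {
  // Signed comparison is preserved under sign-extension of both operands.
  assert((A < B) == ((int32_t)A < (int32_t)B));
  // Unsigned comparison is preserved under zero-extension of both operands.
  assert((UA < UB) == ((uint32_t)UA < (uint32_t)UB));
}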
23008
23009// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
23010// This comes from interleaved vectorization. It is performed late to capture
23011// uitofp converts too.
23013 SelectionDAG &DAG) {
23014 EVT VT = N->getValueType(0);
23015 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
23016 N->getOpcode() != ISD::ZERO_EXTEND ||
23017 N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
23018 return SDValue();
23019
23020 unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
23021 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23022 return SDValue();
23023
23024 EVT InVT = N->getOperand(0).getOperand(0).getValueType();
23025 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
23026 if (!Shuffle ||
23027 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
23028 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
23029 return SDValue();
23030
23031 unsigned Idx;
23033 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
23034 // An undef interleave shuffle can come up after other canonicalizations,
23035 // where the shuffle has been converted to
23036 // zext(extract(shuffle b, undef, [u,u,0,4]))
23037 bool IsUndefDeInterleave = false;
23038 if (!IsDeInterleave)
23039 IsUndefDeInterleave =
23040 Shuffle->getOperand(1).isUndef() &&
23041 all_of(
23042 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements() / 2),
23043 [](int M) { return M < 0; }) &&
23045 Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
23046 VT.getVectorNumElements() / 2),
23047 4, Idx);
23048 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
23049 return SDValue();
23050 SDLoc DL(N);
23051 SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23052 Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
23053 SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23054 Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
23055 SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
23056 VT, BC1, BC2);
23057 if ((Idx & 1) == 1)
23058 UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
23059 DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
23060 return DAG.getNode(
23061 ISD::AND, DL, VT, UZP,
23062 DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
23063}
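
// Editor's note (illustrative addition, not part of this file): the core
// identity behind the uzp+and rewrite, shown here for the simplest stride-2
// case and assuming little-endian lane layout: zero-extending the even i8
// lanes equals reinterpreting the vector as i16 lanes and masking with 0xFF,
// while the odd lanes additionally need a right shift (the uzp2 case).
#include <cassert>
#include <cstdint>
#include <cstring>
static void checkZextDeinterleaveIdentity() {
  uint8_t Narrow[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t Wide[4];
  std::memcpy(Wide, Narrow, sizeof(Wide)); // the NVCAST / reinterpret step
  for (int I = 0; I < 4; ++I) {
    assert((uint16_t)(Wide[I] & 0xFF) == Narrow[2 * I]);   // "uzp1" lanes
    assert((uint16_t)(Wide[I] >> 8) == Narrow[2 * I + 1]); // "uzp2" lanes
  }
}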
23064
23065// This comes up similarly to the above when lowering deinterleaving shuffles
23066// from zexts. We have legalized the operations in the general case to
23067// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
23068// the extract is to the low half and the uzp is uzp1. There would be an extra
23069// shift if the uzp was uzp2, to grab the upper half. Due to the combine above,
23070// there could also be an existing and / shift that can be combined in, either
23071// before or after the extract.
23073 EVT VT = N->getValueType(0);
23074 if (N->getOpcode() != ISD::ZERO_EXTEND ||
23075 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
23076 return SDValue();
23077
23078 SDValue Op = N->getOperand(0);
23079 unsigned ExtOffset = (unsigned)-1;
23080 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23081 ExtOffset = Op.getConstantOperandVal(1);
23082 Op = Op.getOperand(0);
23083 }
23084
23085 unsigned Shift = 0;
23087 Op.getValueType().getScalarSizeInBits());
23088
23089 if (Op.getOpcode() == AArch64ISD::VLSHR) {
23090 Shift = Op.getConstantOperandVal(1);
23091 Op = Op.getOperand(0);
23092 Mask = Mask.lshr(Shift);
23093 }
23094 if (Op.getOpcode() == ISD::AND &&
23095 ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
23096 Op = Op.getOperand(0);
23097 Mask = Mask.zext(VT.getScalarSizeInBits());
23098 } else if (Op.getOpcode() == AArch64ISD::BICi) {
23099 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
23100 Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
23101 Mask = Mask.zext(VT.getScalarSizeInBits());
23102 Op = Op.getOperand(0);
23103 }
23104
23105 if (ExtOffset == (unsigned)-1) {
23106 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23107 ExtOffset = Op.getConstantOperandVal(1);
23108 Op = Op.getOperand(0);
23109 } else
23110 return SDValue();
23111 }
23112 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23113 return SDValue();
23114
23115 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
23116 return SDValue();
23117 if (Op.getOpcode() == AArch64ISD::UZP2)
23118 Shift += VT.getScalarSizeInBits() / 2;
23119
23120 SDLoc DL(N);
23121 SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23122 Op.getOperand(ExtOffset == 0 ? 0 : 1));
23123 if (Shift != 0)
23124 BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
23125 DAG.getConstant(Shift, DL, MVT::i32));
23126 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
23127}
23128
23131 SelectionDAG &DAG) {
23132 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
23133 // we can convert that DUP into another extract_high (of a bigger DUP), which
23134 // helps the backend to decide that an sabdl2 would be useful, saving a real
23135 // extract_high operation.
23136 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
23137 N->getOperand(0).getValueType().is64BitVector() &&
23138 (N->getOperand(0).getOpcode() == ISD::ABDU ||
23139 N->getOperand(0).getOpcode() == ISD::ABDS)) {
23140 SDNode *ABDNode = N->getOperand(0).getNode();
23141 SDValue NewABD =
23143 if (!NewABD.getNode())
23144 return SDValue();
23145
23146 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
23147 }
23148
23150 return R;
23151 if (SDValue R = performZExtUZPCombine(N, DAG))
23152 return R;
23153
23154 if (N->getValueType(0).isFixedLengthVector() &&
23155 N->getOpcode() == ISD::SIGN_EXTEND &&
23156 N->getOperand(0)->getOpcode() == ISD::SETCC)
23157 return performSignExtendSetCCCombine(N, DCI, DAG);
23158
23159 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
23160 // that the top half of the result register must be unused, due to the
23161 // any_extend. This means that we can replace this pattern with (rev16
23162 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
23163 // ...)), which is what this pattern would otherwise be lowered to.
23164 // Only apply this optimisation if the any_extend in the original pattern is
23165 // to i32 or i64, because this type will become the input type to REV16 in the
23166 // new pattern, so it must be a legitimate REV16 input type.
23167 SDValue Bswap = N->getOperand(0);
23168 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
23169 Bswap.getValueType() == MVT::i16 &&
23170 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
23171 SDLoc DL(N);
23172 SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
23173 Bswap->getOperand(0));
23174 return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
23175 NewAnyExtend);
23176 }
23177
23178 return SDValue();
23179}
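
// Editor's note (illustrative addition, not part of this file): REV16
// byte-swaps each 16-bit halfword of the wider register, so when only the low
// halfword of the any_extend result is live it computes exactly the i16 bswap.
// Standalone check (hypothetical names; zext used to model the any_extend):
#include <cassert>
#include <cstdint>
static uint32_t rev16(uint32_t X) {
  return ((X & 0x00FF00FFu) << 8) | ((X >> 8) & 0x00FF00FFu);
}
static void checkRev16MatchesBswap16(uint16_t V) {
  uint16_t Swapped = (uint16_t)((V << 8) | (V >> 8)); // bswap of the i16 value
  uint32_t Extended = V;                              // (any_extend V) as zext
  assert((uint16_t)rev16(Extended) == Swapped);       // low half matches
}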
23180
23182 SDValue SplatVal, unsigned NumVecElts) {
23183 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
23184 Align OrigAlignment = St.getAlign();
23185 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
23186
23187 // Create scalar stores. This is at least as good as the code sequence for a
23188 // split unaligned store which is a dup.s, ext.b, and two stores.
23189 // Most of the time the three stores should be replaced by store pair
23190 // instructions (stp).
23191 SDLoc DL(&St);
23192 SDValue BasePtr = St.getBasePtr();
23193 uint64_t BaseOffset = 0;
23194
23195 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
23196 SDValue NewST1 =
23197 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
23198 OrigAlignment, St.getMemOperand()->getFlags());
23199
23200 // As this is in ISel, we will not merge this add, which may degrade results.
23201 if (BasePtr->getOpcode() == ISD::ADD &&
23202 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
23203 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
23204 BasePtr = BasePtr->getOperand(0);
23205 }
23206
23207 unsigned Offset = EltOffset;
23208 while (--NumVecElts) {
23209 Align Alignment = commonAlignment(OrigAlignment, Offset);
23210 SDValue OffsetPtr =
23211 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23212 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
23213 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
23214 PtrInfo.getWithOffset(Offset), Alignment,
23215 St.getMemOperand()->getFlags());
23216 Offset += EltOffset;
23217 }
23218 return NewST1;
23219}
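// Illustrative sketch (assumption): for a v2i64 splat of x1 stored to [x0],
// the loop above emits
//   str x1, [x0]
//   str x1, [x0, #8]
// which the load/store optimizer is expected to merge into "stp x1, x1, [x0]".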
23220
23221// Returns an SVE type that ContentTy can be trivially sign or zero extended
23222// into.
23223static MVT getSVEContainerType(EVT ContentTy) {
23224 assert(ContentTy.isSimple() && "No SVE containers for extended types");
23225
23226 switch (ContentTy.getSimpleVT().SimpleTy) {
23227 default:
23228 llvm_unreachable("No known SVE container for this MVT type");
23229 case MVT::nxv2i8:
23230 case MVT::nxv2i16:
23231 case MVT::nxv2i32:
23232 case MVT::nxv2i64:
23233 case MVT::nxv2f32:
23234 case MVT::nxv2f64:
23235 return MVT::nxv2i64;
23236 case MVT::nxv4i8:
23237 case MVT::nxv4i16:
23238 case MVT::nxv4i32:
23239 case MVT::nxv4f32:
23240 return MVT::nxv4i32;
23241 case MVT::nxv8i8:
23242 case MVT::nxv8i16:
23243 case MVT::nxv8f16:
23244 case MVT::nxv8bf16:
23245 return MVT::nxv8i16;
23246 case MVT::nxv16i8:
23247 return MVT::nxv16i8;
23248 }
23249}
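// For example (restating the mapping above): nxv2i8 and nxv2f64 map to the
// nxv2i64 container, nxv4i16 maps to nxv4i32, and nxv16i8 is already a full
// container so it maps to itself.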
23250
23251 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
23252 SDLoc DL(N);
23253 EVT VT = N->getValueType(0);
23254
23255 if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
23256 return SDValue();
23257
23258 EVT ContainerVT = VT;
23259 if (ContainerVT.isInteger())
23260 ContainerVT = getSVEContainerType(ContainerVT);
23261
23262 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
23263 SDValue Ops[] = { N->getOperand(0), // Chain
23264 N->getOperand(2), // Pg
23265 N->getOperand(3), // Base
23266 DAG.getValueType(VT) };
23267
23268 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
23269 SDValue LoadChain = SDValue(Load.getNode(), 1);
23270
23271 if (ContainerVT.isInteger() && (VT != ContainerVT))
23272 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
23273
23274 return DAG.getMergeValues({ Load, LoadChain }, DL);
23275}
23276
23277 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
23278 SDLoc DL(N);
23279 EVT VT = N->getValueType(0);
23280 EVT PtrTy = N->getOperand(3).getValueType();
23281
23282 EVT LoadVT = VT;
23283 if (VT.isFloatingPoint())
23284 LoadVT = VT.changeTypeToInteger();
23285
23286 auto *MINode = cast<MemIntrinsicSDNode>(N);
23287 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
23288 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
23289 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
23290 MINode->getOperand(2), PassThru,
23291 MINode->getMemoryVT(), MINode->getMemOperand(),
23292 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
23293
23294 if (VT.isFloatingPoint()) {
23295 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
23296 return DAG.getMergeValues(Ops, DL);
23297 }
23298
23299 return L;
23300}
23301
23302template <unsigned Opcode>
23303 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
23304 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
23305 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
23306 "Unsupported opcode.");
23307 SDLoc DL(N);
23308 EVT VT = N->getValueType(0);
23309
23310 EVT LoadVT = VT;
23311 if (VT.isFloatingPoint())
23312 LoadVT = VT.changeTypeToInteger();
23313
23314 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
23315 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
23316 SDValue LoadChain = SDValue(Load.getNode(), 1);
23317
23318 if (VT.isFloatingPoint())
23319 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
23320
23321 return DAG.getMergeValues({Load, LoadChain}, DL);
23322}
23323
23324 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
23325 SDLoc DL(N);
23326 SDValue Data = N->getOperand(2);
23327 EVT DataVT = Data.getValueType();
23328 EVT HwSrcVt = getSVEContainerType(DataVT);
23329 SDValue InputVT = DAG.getValueType(DataVT);
23330
23331 if (DataVT.isFloatingPoint())
23332 InputVT = DAG.getValueType(HwSrcVt);
23333
23334 SDValue SrcNew;
23335 if (Data.getValueType().isFloatingPoint())
23336 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
23337 else
23338 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
23339
23340 SDValue Ops[] = { N->getOperand(0), // Chain
23341 SrcNew,
23342 N->getOperand(4), // Base
23343 N->getOperand(3), // Pg
23344 InputVT
23345 };
23346
23347 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
23348}
23349
23350 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
23351 SDLoc DL(N);
23352
23353 SDValue Data = N->getOperand(2);
23354 EVT DataVT = Data.getValueType();
23355 EVT PtrTy = N->getOperand(4).getValueType();
23356
23357 if (DataVT.isFloatingPoint())
23358 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
23359
23360 auto *MINode = cast<MemIntrinsicSDNode>(N);
23361 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
23362 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
23363 MINode->getMemoryVT(), MINode->getMemOperand(),
23364 ISD::UNINDEXED, false, false);
23365}
23366
23367/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
23368/// load store optimizer pass will merge them to store pair stores. This should
23369/// be better than a movi to create the vector zero followed by a vector store
23370 /// if the zero constant is not re-used, since one instruction and one register
23371/// live range will be removed.
23372///
23373/// For example, the final generated code should be:
23374///
23375/// stp xzr, xzr, [x0]
23376///
23377/// instead of:
23378///
23379/// movi v0.2d, #0
23380/// str q0, [x0]
23381///
23382 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
23383 SDValue StVal = St.getValue();
23384 EVT VT = StVal.getValueType();
23385
23386 // Avoid scalarizing zero splat stores for scalable vectors.
23387 if (VT.isScalableVector())
23388 return SDValue();
23389
23390 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
23391 // 2, 3 or 4 i32 elements.
23392 int NumVecElts = VT.getVectorNumElements();
23393 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
23394 VT.getVectorElementType().getSizeInBits() == 64) ||
23395 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
23396 VT.getVectorElementType().getSizeInBits() == 32)))
23397 return SDValue();
23398
23399 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
23400 return SDValue();
23401
23402 // If the zero constant has more than one use then the vector store could be
23403 // better since the constant mov will be amortized and stp q instructions
23404 // should be able to be formed.
23405 if (!StVal.hasOneUse())
23406 return SDValue();
23407
23408 // If the store is truncating then it's going down to i16 or smaller, which
23409 // means it can be implemented in a single store anyway.
23410 if (St.isTruncatingStore())
23411 return SDValue();
23412
23413 // If the immediate offset of the address operand is too large for the stp
23414 // instruction, then bail out.
23415 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
23416 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
23417 if (Offset < -512 || Offset > 504)
23418 return SDValue();
23419 }
23420
23421 for (int I = 0; I < NumVecElts; ++I) {
23422 SDValue EltVal = StVal.getOperand(I);
23423 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
23424 return SDValue();
23425 }
23426
23427 // Use a CopyFromReg WZR/XZR here to prevent
23428 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
23429 SDLoc DL(&St);
23430 unsigned ZeroReg;
23431 EVT ZeroVT;
23432 if (VT.getVectorElementType().getSizeInBits() == 32) {
23433 ZeroReg = AArch64::WZR;
23434 ZeroVT = MVT::i32;
23435 } else {
23436 ZeroReg = AArch64::XZR;
23437 ZeroVT = MVT::i64;
23438 }
23439 SDValue SplatVal =
23440 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
23441 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23442}
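// Illustrative sketch (assumption): for a v4i32 zero splat the code above
// copies WZR and emits four 32-bit scalar stores, which are expected to be
// paired later into
//   stp wzr, wzr, [x0]
//   stp wzr, wzr, [x0, #8]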
23443
23444/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
23445/// value. The load store optimizer pass will merge them to store pair stores.
23446/// This has better performance than a splat of the scalar followed by a split
23447/// vector store. Even if the stores are not merged it is four stores vs a dup,
23448/// followed by an ext.b and two stores.
23449 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
23450 SDValue StVal = St.getValue();
23451 EVT VT = StVal.getValueType();
23452
23453 // Don't replace floating point stores, they possibly won't be transformed to
23454 // stp because of the store pair suppress pass.
23455 if (VT.isFloatingPoint())
23456 return SDValue();
23457
23458 // We can express a splat as store pair(s) for 2 or 4 elements.
23459 unsigned NumVecElts = VT.getVectorNumElements();
23460 if (NumVecElts != 4 && NumVecElts != 2)
23461 return SDValue();
23462
23463 // If the store is truncating then it's going down to i16 or smaller, which
23464 // means it can be implemented in a single store anyway.
23465 if (St.isTruncatingStore())
23466 return SDValue();
23467
23468 // Check that this is a splat.
23469 // Make sure that each of the relevant vector element locations are inserted
23470 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
23471 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
23472 SDValue SplatVal;
23473 for (unsigned I = 0; I < NumVecElts; ++I) {
23474 // Check for insert vector elements.
23475 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
23476 return SDValue();
23477
23478 // Check that same value is inserted at each vector element.
23479 if (I == 0)
23480 SplatVal = StVal.getOperand(1);
23481 else if (StVal.getOperand(1) != SplatVal)
23482 return SDValue();
23483
23484 // Check insert element index.
23485 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
23486 if (!CIndex)
23487 return SDValue();
23488 uint64_t IndexVal = CIndex->getZExtValue();
23489 if (IndexVal >= NumVecElts)
23490 return SDValue();
23491 IndexNotInserted.reset(IndexVal);
23492
23493 StVal = StVal.getOperand(0);
23494 }
23495 // Check that all vector element locations were inserted to.
23496 if (IndexNotInserted.any())
23497 return SDValue();
23498
23499 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
23500}
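// Illustrative sketch (assumption): a store of a v4i32 splat of w1 becomes
//   str w1, [x0]
//   str w1, [x0, #4]
//   str w1, [x0, #8]
//   str w1, [x0, #12]
// which the load/store optimizer is expected to pair into two stp instructions.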
23501
23502 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23503 SelectionDAG &DAG,
23504 const AArch64Subtarget *Subtarget) {
23505
23506 StoreSDNode *S = cast<StoreSDNode>(N);
23507 if (S->isVolatile() || S->isIndexed())
23508 return SDValue();
23509
23510 SDValue StVal = S->getValue();
23511 EVT VT = StVal.getValueType();
23512
23513 if (!VT.isFixedLengthVector())
23514 return SDValue();
23515
23516 // If we get a splat of zeros, convert this vector store to a store of
23517 // scalars. They will be merged into store pairs of xzr thereby removing one
23518 // instruction and one register.
23519 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
23520 return ReplacedZeroSplat;
23521
23522 // FIXME: The logic for deciding if an unaligned store should be split should
23523 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
23524 // a call to that function here.
23525
23526 if (!Subtarget->isMisaligned128StoreSlow())
23527 return SDValue();
23528
23529 // Don't split at -Oz.
23530 if (DAG.getMachineFunction().getFunction().hasMinSize())
23531 return SDValue();
23532
23533 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
23534 // those up regresses performance on micro-benchmarks and olden/bh.
23535 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
23536 return SDValue();
23537
23538 // Split unaligned 16B stores. They are terrible for performance.
23539 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
23540 // extensions can use this to mark that it does not want splitting to happen
23541 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
23542 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
23543 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
23544 S->getAlign() <= Align(2))
23545 return SDValue();
23546
23547 // If we get a splat of a scalar convert this vector store to a store of
23548 // scalars. They will be merged into store pairs thereby removing two
23549 // instructions.
23550 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
23551 return ReplacedSplat;
23552
23553 SDLoc DL(S);
23554
23555 // Split VT into two.
23556 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
23557 unsigned NumElts = HalfVT.getVectorNumElements();
23558 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23559 DAG.getConstant(0, DL, MVT::i64));
23560 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
23561 DAG.getConstant(NumElts, DL, MVT::i64));
23562 SDValue BasePtr = S->getBasePtr();
23563 SDValue NewST1 =
23564 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
23565 S->getAlign(), S->getMemOperand()->getFlags());
23566 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23567 DAG.getConstant(8, DL, MVT::i64));
23568 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
23569 S->getPointerInfo(), S->getAlign(),
23570 S->getMemOperand()->getFlags());
23571}
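// Illustrative sketch (assumption): a 16-byte store of a v4f32 value with
// alignment 4, on a target where misaligned 128-bit stores are slow, is split
// above into two 8-byte stores of the low and high halves at offsets 0 and 8.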
23572
23573 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
23574 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
23575
23576 // splice(pg, op1, undef) -> op1
23577 if (N->getOperand(2).isUndef())
23578 return N->getOperand(1);
23579
23580 return SDValue();
23581}
23582
23583 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
23584 const AArch64Subtarget *Subtarget) {
23585 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
23586 N->getOpcode() == AArch64ISD::UUNPKLO) &&
23587 "Unexpected Opcode!");
23588
23589 // uunpklo/hi undef -> undef
23590 if (N->getOperand(0).isUndef())
23591 return DAG.getUNDEF(N->getValueType(0));
23592
23593 // If this is a masked load followed by an UUNPKLO, fold this into a masked
23594 // extending load. We can do this even if this is already a masked
23595 // {z,}extload.
23596 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
23597 N->getOpcode() == AArch64ISD::UUNPKLO) {
23598 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
23599 SDValue Mask = MLD->getMask();
23600 SDLoc DL(N);
23601
23602 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
23603 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
23604 (MLD->getPassThru()->isUndef() ||
23605 isZerosVector(MLD->getPassThru().getNode()))) {
23606 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
23607 unsigned PgPattern = Mask->getConstantOperandVal(0);
23608 EVT VT = N->getValueType(0);
23609
23610 // Ensure we can double the size of the predicate pattern
23611 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
23612 if (NumElts &&
23613 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
23614 Mask =
23615 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
23616 SDValue PassThru = DAG.getConstant(0, DL, VT);
23617 SDValue NewLoad = DAG.getMaskedLoad(
23618 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
23619 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
23620 MLD->getAddressingMode(), ISD::ZEXTLOAD);
23621
23622 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
23623
23624 return NewLoad;
23625 }
23626 }
23627 }
23628
23629 return SDValue();
23630}
23631
23633 if (N->getOpcode() != AArch64ISD::UZP1)
23634 return false;
23635 SDValue Op0 = N->getOperand(0);
23636 EVT SrcVT = Op0->getValueType(0);
23637 EVT DstVT = N->getValueType(0);
23638 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
23639 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
23640 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
23641}
23642
23643// Try to combine rounding shifts where the operands come from an extend, and
23644// the result is truncated and combined into one vector.
23645// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
23646 static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
23647 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
23648 SDValue Op0 = N->getOperand(0);
23649 SDValue Op1 = N->getOperand(1);
23650 EVT ResVT = N->getValueType(0);
23651
23652 unsigned RshOpc = Op0.getOpcode();
23653 if (RshOpc != AArch64ISD::RSHRNB_I)
23654 return SDValue();
23655
23656 // Same op code and imm value?
23657 SDValue ShiftValue = Op0.getOperand(1);
23658 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
23659 return SDValue();
23660
23661 // Same unextended operand value?
23662 SDValue Lo = Op0.getOperand(0);
23663 SDValue Hi = Op1.getOperand(0);
23664 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
23665 Hi.getOpcode() != AArch64ISD::UUNPKHI)
23666 return SDValue();
23667 SDValue OrigArg = Lo.getOperand(0);
23668 if (OrigArg != Hi.getOperand(0))
23669 return SDValue();
23670
23671 SDLoc DL(N);
23672 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
23673 getPredicateForVector(DAG, DL, ResVT), OrigArg,
23674 ShiftValue);
23675}
23676
23677// Try to simplify:
23678// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
23679// t2 = nxv8i16 srl(t1, ShiftValue)
23680// to
23681// t1 = nxv8i16 rshrnb(X, shiftvalue).
23682// rshrnb will zero the top half bits of each element. Therefore, this combine
23683// should only be performed when a following instruction with the rshrnb
23684// as an operand does not care about the top half of each element. For example,
23685// a uzp1 or a truncating store.
23686 static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
23687 const AArch64Subtarget *Subtarget) {
23688 EVT VT = Srl->getValueType(0);
23689 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
23690 return SDValue();
23691
23692 EVT ResVT;
23693 if (VT == MVT::nxv8i16)
23694 ResVT = MVT::nxv16i8;
23695 else if (VT == MVT::nxv4i32)
23696 ResVT = MVT::nxv8i16;
23697 else if (VT == MVT::nxv2i64)
23698 ResVT = MVT::nxv4i32;
23699 else
23700 return SDValue();
23701
23702 SDLoc DL(Srl);
23703 unsigned ShiftValue;
23704 SDValue RShOperand;
23705 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
23706 return SDValue();
23707 SDValue Rshrnb = DAG.getNode(
23708 AArch64ISD::RSHRNB_I, DL, ResVT,
23709 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
23710 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
23711}
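// Worked example (illustrative): with VT = nxv8i16 and ShiftValue = 4, the
// pattern "srl(add(X, splat(8)), splat(4))" (where 8 == 1 << (4 - 1)) is
// rewritten to RSHRNB_I(X, 4) producing nxv16i8, then NVCAST back to nxv8i16;
// per the comment above this is only valid when the user ignores the top half
// of each element.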
23712
23713 static SDValue isNVCastToHalfWidthElements(SDValue V) {
23714 if (V.getOpcode() != AArch64ISD::NVCAST)
23715 return SDValue();
23716
23717 SDValue Op = V.getOperand(0);
23718 if (!Op.getValueType().isVector() ||
23719 V.getValueType().getVectorElementCount() !=
23720 Op.getValueType().getVectorElementCount() * 2)
23721 return SDValue();
23722
23723 return Op;
23724}
23725
23726 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
23727 const AArch64Subtarget *Subtarget) {
23728 SDLoc DL(N);
23729 SDValue Op0 = N->getOperand(0);
23730 SDValue Op1 = N->getOperand(1);
23731 EVT ResVT = N->getValueType(0);
23732
23733 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
23734 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23735 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23736 Op0.getOperand(0) == Op1.getOperand(0)) {
23737
23738 SDValue SourceVec = Op0.getOperand(0);
23739 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
23740 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
23741 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
23742 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
23743 EVT OpVT = Op0.getOperand(1).getValueType();
23744 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23745 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
23746 DAG.getUNDEF(WidenedResVT));
23747 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
23748 DAG.getConstant(0, DL, OpVT));
23749 }
23750 }
23751
23752 // Following optimizations only work with uzp1.
23753 if (N->getOpcode() == AArch64ISD::UZP2)
23754 return SDValue();
23755
23756 // uzp1(x, undef) -> concat(truncate(x), undef)
23757 if (Op1.getOpcode() == ISD::UNDEF) {
23758 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
23759 switch (ResVT.getSimpleVT().SimpleTy) {
23760 default:
23761 break;
23762 case MVT::v16i8:
23763 BCVT = MVT::v8i16;
23764 HalfVT = MVT::v8i8;
23765 break;
23766 case MVT::v8i16:
23767 BCVT = MVT::v4i32;
23768 HalfVT = MVT::v4i16;
23769 break;
23770 case MVT::v4i32:
23771 BCVT = MVT::v2i64;
23772 HalfVT = MVT::v2i32;
23773 break;
23774 }
23775 if (BCVT != MVT::Other) {
23776 SDValue BC = DAG.getBitcast(BCVT, Op0);
23777 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
23778 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
23779 DAG.getUNDEF(HalfVT));
23780 }
23781 }
23782
23783 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
23784 return Urshr;
23785
23786 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23787 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23788 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23789 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
23790 }
23791 }
23792
23793 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23794 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
23795 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
23796 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
23797 }
23798 }
23799
23800 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
23801 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
23802 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
23803 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23804 SDValue X = PreCast.getOperand(0).getOperand(0);
23805 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
23806 }
23807 }
23808 }
23809
23810 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
23811 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
23812 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
23813 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
23814 SDValue Z = PreCast.getOperand(0).getOperand(1);
23815 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
23816 }
23817 }
23818 }
23819
23820 // These optimizations only work on little endian.
23821 if (!DAG.getDataLayout().isLittleEndian())
23822 return SDValue();
23823
23824 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
23825 // Example:
23826 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
23827 // to
23828 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
23830 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
23831 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
23832 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
23833 Op1.getOperand(0));
23834 }
23835 }
23836
23837 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
23838 return SDValue();
23839
23840 SDValue SourceOp0 = peekThroughBitcasts(Op0);
23841 SDValue SourceOp1 = peekThroughBitcasts(Op1);
23842
23843 // truncating uzp1(x, y) -> xtn(concat (x, y))
23844 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
23845 EVT Op0Ty = SourceOp0.getValueType();
23846 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
23847 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
23848 SDValue Concat =
23849 DAG.getNode(ISD::CONCAT_VECTORS, DL,
23850 Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
23851 SourceOp0, SourceOp1);
23852 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
23853 }
23854 }
23855
23856 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
23857 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
23858 SourceOp1.getOpcode() != ISD::TRUNCATE)
23859 return SDValue();
23860 SourceOp0 = SourceOp0.getOperand(0);
23861 SourceOp1 = SourceOp1.getOperand(0);
23862
23863 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
23864 !SourceOp0.getValueType().isSimple())
23865 return SDValue();
23866
23867 EVT ResultTy;
23868
23869 switch (SourceOp0.getSimpleValueType().SimpleTy) {
23870 case MVT::v2i64:
23871 ResultTy = MVT::v4i32;
23872 break;
23873 case MVT::v4i32:
23874 ResultTy = MVT::v8i16;
23875 break;
23876 case MVT::v8i16:
23877 ResultTy = MVT::v16i8;
23878 break;
23879 default:
23880 return SDValue();
23881 }
23882
23883 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
23884 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
23885 SDValue UzpResult =
23886 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
23887
23888 EVT BitcastResultTy;
23889
23890 switch (ResVT.getSimpleVT().SimpleTy) {
23891 case MVT::v2i32:
23892 BitcastResultTy = MVT::v2i64;
23893 break;
23894 case MVT::v4i16:
23895 BitcastResultTy = MVT::v4i32;
23896 break;
23897 case MVT::v8i8:
23898 BitcastResultTy = MVT::v8i16;
23899 break;
23900 default:
23901 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
23902 }
23903
23904 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
23905 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
23906}
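// Illustrative example (assumption) for the truncating case above: with
// ResVT = v8i8 and both sources bitcast from v4i16 values x and y,
// "uzp1 v8i8 (bitcast x), (bitcast y)" becomes "truncate (v8i16 concat(x, y))",
// i.e. a single xtn of the concatenation.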
23907
23908 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
23909 unsigned Opc = N->getOpcode();
23910
23911 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
23912 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23913 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
23914 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23915 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
23916 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
23917 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
23918 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
23919
23920 SDLoc DL(N);
23921 SDValue Chain = N->getOperand(0);
23922 SDValue Pg = N->getOperand(1);
23923 SDValue Base = N->getOperand(2);
23924 SDValue Offset = N->getOperand(3);
23925 SDValue Ty = N->getOperand(4);
23926
23927 EVT ResVT = N->getValueType(0);
23928
23929 const auto OffsetOpc = Offset.getOpcode();
23930 const bool OffsetIsZExt =
23931 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
23932 const bool OffsetIsSExt =
23933 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
23934
23935 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
23936 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
23937 SDValue ExtPg = Offset.getOperand(0);
23938 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
23939 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
23940
23941 // If the predicate for the sign- or zero-extended offset is the
23942 // same as the predicate used for this load and the sign-/zero-extension
23943 // was from a 32-bit value...
23944 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
23945 SDValue UnextendedOffset = Offset.getOperand(1);
23946
23947 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
23948 if (Signed)
23949 NewOpc = getSignExtendedGatherOpcode(NewOpc);
23950
23951 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
23952 {Chain, Pg, Base, UnextendedOffset, Ty});
23953 }
23954 }
23955
23956 return SDValue();
23957}
23958
23959/// Optimize a vector shift instruction and its operand if shifted out
23960/// bits are not used.
23961 static SDValue performVectorShiftCombine(SDNode *N,
23962 const AArch64TargetLowering &TLI,
23963 TargetLowering::DAGCombinerInfo &DCI) {
23964 assert(N->getOpcode() == AArch64ISD::VASHR ||
23965 N->getOpcode() == AArch64ISD::VLSHR);
23966
23967 SDValue Op = N->getOperand(0);
23968 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
23969
23970 unsigned ShiftImm = N->getConstantOperandVal(1);
23971 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
23972
23973 // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
23974 if (N->getOpcode() == AArch64ISD::VASHR &&
23975 Op.getOpcode() == AArch64ISD::VSHL &&
23976 N->getOperand(1) == Op.getOperand(1))
23977 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
23978 return Op.getOperand(0);
23979
23980 // If the shift is exact, the shifted out bits matter.
23981 if (N->getFlags().hasExact())
23982 return SDValue();
23983
23984 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
23985 APInt DemandedMask = ~ShiftedOutBits;
23986
23987 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
23988 return SDValue(N, 0);
23989
23990 return SDValue();
23991}
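// Illustrative example (assumption): for "VASHR(VSHL(x, 8), 8)" on v8i16, if
// ComputeNumSignBits shows x already has more than 8 sign bits per lane, the
// shift pair above is removed entirely and x is used directly.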
23992
23993 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
23994 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
23995 // This transform works in partnership with performSetCCPunpkCombine to
23996 // remove unnecessary transfer of predicates into standard registers and back
23997 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
23998 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
23999 MVT::i1) {
24000 SDValue CC = N->getOperand(0)->getOperand(0);
24001 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
24002 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
24003 DAG.getVectorIdxConstant(0, SDLoc(N)));
24004 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
24005 }
24006
24007 return SDValue();
24008}
24009
24010/// Target-specific DAG combine function for post-increment LD1 (lane) and
24011/// post-increment LD1R.
24012 static SDValue performPostLD1Combine(SDNode *N,
24013 TargetLowering::DAGCombinerInfo &DCI,
24014 bool IsLaneOp) {
24015 if (DCI.isBeforeLegalizeOps())
24016 return SDValue();
24017
24018 SelectionDAG &DAG = DCI.DAG;
24019 EVT VT = N->getValueType(0);
24020
24021 if (!VT.is128BitVector() && !VT.is64BitVector())
24022 return SDValue();
24023
24024 // If it is not LOAD, can not do such combine.
24025 unsigned LoadIdx = IsLaneOp ? 1 : 0;
24026 LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
24027 if (!LD)
24028 return SDValue();
24029
24030 // If the Generic combiner already helped form a pre- or post-indexed load,
24031 // skip forming one here.
24032 if (LD->isIndexed())
24033 return SDValue();
24034
24035 // The vector lane must be a constant in the LD1LANE opcode.
24036 SDValue Lane;
24037 if (IsLaneOp) {
24038 Lane = N->getOperand(2);
24039 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
24040 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
24041 return SDValue();
24042 if (LaneC->getZExtValue() == 0 && isNullOrNullSplat(N->getOperand(0)))
24043 return SDValue();
24044 }
24045
24046 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
24047 EVT MemVT = LoadSDN->getMemoryVT();
24048 // Check if memory operand is the same type as the vector element.
24049 if (MemVT != VT.getVectorElementType())
24050 return SDValue();
24051
24052 // Check if there are other uses. If so, do not combine as it will introduce
24053 // an extra load.
24054 for (SDUse &U : LD->uses()) {
24055 if (U.getResNo() == 1) // Ignore uses of the chain result.
24056 continue;
24057 if (U.getUser() != N)
24058 return SDValue();
24059 }
24060
24061 // If there is one use and it can splat the value, prefer that operation.
24062 // TODO: This could be expanded to more operations if they reliably use the
24063 // index variants.
24064 if (N->hasOneUse()) {
24065 unsigned UseOpc = N->user_begin()->getOpcode();
24066 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
24067 return SDValue();
24068 }
24069
24070 SDValue Addr = LD->getOperand(1);
24071 SDValue Vector = N->getOperand(0);
24072 // Search for a use of the address operand that is an increment.
24073 for (SDUse &Use : Addr->uses()) {
24074 SDNode *User = Use.getUser();
24075 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24076 continue;
24077
24078 // If the increment is a constant, it must match the memory ref size.
24079 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
24080 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
24081 uint32_t IncVal = CInc->getZExtValue();
24082 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
24083 if (IncVal != NumBytes)
24084 continue;
24085 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
24086 }
24087
24088 // To avoid cycle construction make sure that neither the load nor the add
24089 // are predecessors to each other or the Vector.
24090 SmallPtrSet<const SDNode *, 32> Visited;
24091 SmallVector<const SDNode *, 16> Worklist;
24092 Visited.insert(Addr.getNode());
24093 Worklist.push_back(User);
24094 Worklist.push_back(LD);
24095 Worklist.push_back(Vector.getNode());
24096 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
24097 SDNode::hasPredecessorHelper(User, Visited, Worklist))
24098 continue;
24099
24100 SmallVector<SDValue, 8> Ops;
24101 Ops.push_back(LD->getOperand(0)); // Chain
24102 if (IsLaneOp) {
24103 Ops.push_back(Vector); // The vector to be inserted
24104 Ops.push_back(Lane); // The lane to be inserted in the vector
24105 }
24106 Ops.push_back(Addr);
24107 Ops.push_back(Inc);
24108
24109 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
24110 SDVTList SDTys = DAG.getVTList(Tys);
24111 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
24112 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
24113 MemVT,
24114 LoadSDN->getMemOperand());
24115
24116 // Update the uses.
24117 SDValue NewResults[] = {
24118 SDValue(LD, 0), // The result of load
24119 SDValue(UpdN.getNode(), 2) // Chain
24120 };
24121 DCI.CombineTo(LD, NewResults);
24122 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
24123 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
24124
24125 break;
24126 }
24127 return SDValue();
24128}
24129
24130/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
24131/// address translation.
24132static bool performTBISimplification(SDValue Addr,
24133 TargetLowering::DAGCombinerInfo &DCI,
24134 SelectionDAG &DAG) {
24135 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
24136 KnownBits Known;
24137 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
24138 !DCI.isBeforeLegalizeOps());
24139 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24140 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
24141 DCI.CommitTargetLoweringOpt(TLO);
24142 return true;
24143 }
24144 return false;
24145}
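// Illustrative example (assumption): with top-byte-ignore, only address bits
// [55:0] are demanded, so a sequence such as
//   and x8, x1, #0x00ffffffffffffff
//   ldr x0, [x8]
// can have the AND simplified away, since the hardware ignores bits [63:56]
// during address translation.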
24146
24147static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
24148 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
24149 "Expected STORE dag node in input!");
24150
24151 if (auto Store = dyn_cast<StoreSDNode>(N)) {
24152 if (!Store->isTruncatingStore() || Store->isIndexed())
24153 return SDValue();
24154 SDValue Ext = Store->getValue();
24155 auto ExtOpCode = Ext.getOpcode();
24156 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
24157 ExtOpCode != ISD::ANY_EXTEND)
24158 return SDValue();
24159 SDValue Orig = Ext->getOperand(0);
24160 if (Store->getMemoryVT() != Orig.getValueType())
24161 return SDValue();
24162 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
24163 Store->getBasePtr(), Store->getMemOperand());
24164 }
24165
24166 return SDValue();
24167}
24168
24169// A custom combine to lower load <3 x i8> as the more efficient sequence
24170// below:
24171// ldrb wX, [x0, #2]
24172// ldrh wY, [x0]
24173// orr wX, wY, wX, lsl #16
24174// fmov s0, wX
24175//
24176// Note that an alternative sequence with even fewer (although usually more
24177// complex/expensive) instructions would be:
24178// ld1r.4h { v0 }, [x0], #2
24179// ld1.b { v0 }[2], [x0]
24180//
24181// Generating this sequence unfortunately results in noticeably worse codegen
24182// for code that extends the loaded v3i8, due to legalization breaking vector
24183// shuffle detection in a way that is very difficult to work around.
24184// TODO: Revisit once v3i8 legalization has been improved in general.
24185static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
24186 EVT MemVT = LD->getMemoryVT();
24187 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
24188 LD->getBaseAlign() >= 4)
24189 return SDValue();
24190
24191 SDLoc DL(LD);
24192 MachineFunction &MF = DAG.getMachineFunction();
24193 SDValue Chain = LD->getChain();
24194 SDValue BasePtr = LD->getBasePtr();
24195 MachineMemOperand *MMO = LD->getMemOperand();
24196 assert(LD->getOffset().isUndef() && "undef offset expected");
24197
24198 // Load 2 x i8, then 1 x i8.
24199 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
24200 TypeSize Offset2 = TypeSize::getFixed(2);
24201 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
24202 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
24203 MF.getMachineMemOperand(MMO, 2, 1));
24204
24205 // Extend to i32.
24206 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
24207 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
24208
24209 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
24210 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
24211 DAG.getConstant(16, DL, MVT::i32));
24212 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
24213 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
24214
24215 // Extract v3i8 again.
24216 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
24217 DAG.getConstant(0, DL, MVT::i64));
24218 SDValue TokenFactor = DAG.getNode(
24219 ISD::TokenFactor, DL, MVT::Other,
24220 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
24221 return DAG.getMergeValues({Extract, TokenFactor}, DL);
24222}
24223
24224// Perform TBI simplification if supported by the target and try to break up
24225 // nontemporal loads larger than 256 bits for odd types so that LDNPQ 256-bit
24226// load instructions can be selected.
24227static SDValue performLOADCombine(SDNode *N,
24229 SelectionDAG &DAG,
24230 const AArch64Subtarget *Subtarget) {
24231 if (Subtarget->supportsAddressTopByteIgnored())
24232 performTBISimplification(N->getOperand(1), DCI, DAG);
24233
24234 LoadSDNode *LD = cast<LoadSDNode>(N);
24235 EVT RegVT = LD->getValueType(0);
24236 EVT MemVT = LD->getMemoryVT();
24237 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24238 SDLoc DL(LD);
24239
24240 // Cast ptr32 and ptr64 pointers to the default address space before a load.
24241 unsigned AddrSpace = LD->getAddressSpace();
24242 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24243 AddrSpace == ARM64AS::PTR32_UPTR) {
24244 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24245 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
24246 SDValue Cast =
24247 DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0);
24248 return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(),
24249 Cast, LD->getPointerInfo(), MemVT,
24250 LD->getBaseAlign(),
24251 LD->getMemOperand()->getFlags());
24252 }
24253 }
24254
24255 if (LD->isVolatile() || !Subtarget->isLittleEndian())
24256 return SDValue(N, 0);
24257
24258 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
24259 return Res;
24260
24261 if (!LD->isNonTemporal())
24262 return SDValue(N, 0);
24263
24264 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
24265 MemVT.getSizeInBits() % 256 == 0 ||
24266 256 % MemVT.getScalarSizeInBits() != 0)
24267 return SDValue(N, 0);
24268
24269 SDValue Chain = LD->getChain();
24270 SDValue BasePtr = LD->getBasePtr();
24271 SDNodeFlags Flags = LD->getFlags();
24272 SmallVector<SDValue, 4> LoadOps;
24273 SmallVector<SDValue, 4> LoadOpsChain;
24274 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
24275 // and a scalar/vector load of less than 256 bits. This way we can utilize
24276 // 256-bit loads and reduce the number of load instructions generated.
24277 MVT NewVT =
24278 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
24279 256 / MemVT.getVectorElementType().getSizeInBits());
24280 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
24281 // Create all 256-bit loads starting from offset 0 and up to (Num256Loads - 1) * 32.
24282 for (unsigned I = 0; I < Num256Loads; I++) {
24283 unsigned PtrOffset = I * 32;
24284 SDValue NewPtr = DAG.getMemBasePlusOffset(
24285 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24286 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24287 SDValue NewLoad = DAG.getLoad(
24288 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
24289 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
24290 LoadOps.push_back(NewLoad);
24291 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
24292 }
24293
24294 // Process remaining bits of the load operation.
24295 // This is done by creating an UNDEF vector to match the size of the
24296 // 256-bit loads and inserting the remaining load to it. We extract the
24297 // original load type at the end using EXTRACT_SUBVECTOR instruction.
24298 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
24299 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
24300 MVT RemainingVT = MVT::getVectorVT(
24301 MemVT.getVectorElementType().getSimpleVT(),
24302 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
24303 SDValue NewPtr = DAG.getMemBasePlusOffset(
24304 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24305 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24306 SDValue RemainingLoad =
24307 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
24308 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
24309 LD->getMemOperand()->getFlags(), LD->getAAInfo());
24310 SDValue UndefVector = DAG.getUNDEF(NewVT);
24311 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
24312 SDValue ExtendedRemainingLoad =
24313 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
24314 {UndefVector, RemainingLoad, InsertIdx});
24315 LoadOps.push_back(ExtendedRemainingLoad);
24316 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
24317 EVT ConcatVT =
24318 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24319 LoadOps.size() * NewVT.getVectorNumElements());
24320 SDValue ConcatVectors =
24321 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
24322 // Extract the original vector type size.
24323 SDValue ExtractSubVector =
24324 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
24325 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
24326 SDValue TokenFactor =
24327 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
24328 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
24329}
24330
24331static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
24332 EVT VecVT = Op.getValueType();
24333 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
24334 "Need boolean vector type.");
24335
24336 if (Depth > 3)
24338
24339 // We can get the base type from a vector compare or truncate.
24340 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
24341 return Op.getOperand(0).getValueType();
24342
24343 // If an operand is a bool vector, continue looking.
24345 for (SDValue Operand : Op->op_values()) {
24346 if (Operand.getValueType() != VecVT)
24347 continue;
24348
24349 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
24350 if (!BaseVT.isSimple())
24351 BaseVT = OperandVT;
24352 else if (OperandVT != BaseVT)
24354 }
24355
24356 return BaseVT;
24357}
24358
24359// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
24360// iN, we can use a trick that extracts the i^th bit from the i^th element and
24361// then performs a vector add to get a scalar bitmask. This requires that each
24362// element's bits are either all 1 or all 0.
24363static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
24364 SDLoc DL(N);
24365 SDValue ComparisonResult(N, 0);
24366 EVT VecVT = ComparisonResult.getValueType();
24367 assert(VecVT.isVector() && "Must be a vector type");
24368
24369 unsigned NumElts = VecVT.getVectorNumElements();
24370 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
24371 return SDValue();
24372
24373 if (VecVT.getVectorElementType() != MVT::i1 &&
24374 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
24375 return SDValue();
24376
24377 // If we can find the original types to work on instead of a vector of i1,
24378 // we can avoid extend/extract conversion instructions.
24379 if (VecVT.getVectorElementType() == MVT::i1) {
24380 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
24381 if (!VecVT.isSimple()) {
24382 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
24383 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
24384 }
24385 }
24386 VecVT = VecVT.changeVectorElementTypeToInteger();
24387
24388 // Large vectors don't map directly to this conversion, so to avoid too many
24389 // edge cases, we don't apply it here. The conversion will likely still be
24390 // applied later via multiple smaller vectors, whose results are concatenated.
24391 if (VecVT.getSizeInBits() > 128)
24392 return SDValue();
24393
24394 // Ensure that all elements' bits are either 0s or 1s.
24395 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
24396
24397 bool IsLE = DAG.getDataLayout().isLittleEndian();
24398 SmallVector<SDValue, 16> MaskConstants;
24400 VecVT == MVT::v16i8) {
24401 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
24402 // per entry. We split it into two halves, apply the mask, zip the halves to
24403 // create 8x 16-bit values, and then perform the vector reduce.
24404 for (unsigned Half = 0; Half < 2; ++Half) {
24405 for (unsigned I = 0; I < 8; ++I) {
24406 // On big-endian targets, the lane order in sub-byte vector elements
24407 // gets reversed, so we need to flip the bit index.
24408 unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
24409 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
24410 }
24411 }
24412 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24413 SDValue RepresentativeBits =
24414 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24415
24416 SDValue UpperRepresentativeBits =
24417 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
24418 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
24419 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
24420 RepresentativeBits, UpperRepresentativeBits);
24421 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
24422 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
24423 }
24424
24425 // All other vector sizes.
24426 unsigned NumEl = VecVT.getVectorNumElements();
24427 for (unsigned I = 0; I < NumEl; ++I) {
24428 unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
24429 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
24430 }
24431
24432 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
24433 SDValue RepresentativeBits =
24434 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
24435 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
24436 NumElts, VecVT.getVectorElementType().getSizeInBits()));
24437 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
24438}
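// Worked example (illustrative): for a v4i32 comparison result the mask vector
// built above is <1, 2, 4, 8> (reversed on big-endian); after the AND,
// VECREDUCE_ADD sums the surviving bits, so bit i of the scalar result is set
// exactly when lane i of the comparison was all-ones.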
24439
24440static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
24441 StoreSDNode *Store) {
24442 if (!Store->isTruncatingStore())
24443 return SDValue();
24444
24445 SDLoc DL(Store);
24446 SDValue VecOp = Store->getValue();
24447 EVT VT = VecOp.getValueType();
24448 EVT MemVT = Store->getMemoryVT();
24449
24450 if (!MemVT.isVector() || !VT.isVector() ||
24451 MemVT.getVectorElementType() != MVT::i1)
24452 return SDValue();
24453
24454 // If we are storing a vector that we are currently building, let
24455 // `scalarizeVectorStore()` handle this more efficiently.
24456 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
24457 return SDValue();
24458
24459 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
24460 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
24461 if (!VectorBits)
24462 return SDValue();
24463
24464 EVT StoreVT =
24466 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
24467 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
24468 Store->getMemOperand());
24469}
24470
24471// Combine store (fp_to_int X) to use vector semantics around the conversion
24472// when NEON is available. This allows us to store the in-vector result directly
24473// without transferring the result into a GPR in the process.
24474static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
24476 SelectionDAG &DAG,
24477 const AArch64Subtarget *Subtarget) {
24478 // Limit to post-legalization in order to avoid peeling truncating stores.
24479 if (DCI.isBeforeLegalize())
24480 return SDValue();
24481 if (!Subtarget->isNeonAvailable())
24482 return SDValue();
24483 // Source operand is already a vector.
24484 SDValue Value = ST->getValue();
24485 if (Value.getValueType().isVector())
24486 return SDValue();
24487
24488 // Look through potential assertions.
24489 while (Value->isAssert())
24490 Value = Value.getOperand(0);
24491
24492 if (Value.getOpcode() != ISD::FP_TO_SINT &&
24493 Value.getOpcode() != ISD::FP_TO_UINT)
24494 return SDValue();
24495 if (!Value->hasOneUse())
24496 return SDValue();
24497
24498 SDValue FPSrc = Value.getOperand(0);
24499 EVT SrcVT = FPSrc.getValueType();
24500 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
24501 return SDValue();
24502
24503 // No support for assignments such as i64 = fp_to_sint i32
24504 EVT VT = Value.getSimpleValueType();
24505 if (VT != SrcVT.changeTypeToInteger())
24506 return SDValue();
24507
24508 // Create a 128-bit element vector to avoid widening. The floating point
24509 // conversion is transformed into a single element conversion via a pattern.
24510 unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
24511 EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
24512 EVT VecDstVT = VecSrcVT.changeTypeToInteger();
24513 SDLoc DL(ST);
24514 SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
24515 SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
24516
24517 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
24518 SDValue Extracted =
24519 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
24520
24521 DCI.CombineTo(ST->getValue().getNode(), Extracted);
24522 return SDValue(ST, 0);
24523}
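// Illustrative codegen sketch (assumption): for "store i32 (fp_to_sint f32 x)"
// the combine above allows something like
//   fcvtzs s0, s0
//   str s0, [x0]
// instead of converting through a GPR (fcvtzs w8, s0; str w8, [x0]).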
24524
24525bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
24526 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
24527 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
24528 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
24529}
24530
24531// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
24532static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
24533 const AArch64Subtarget *Subtarget) {
24534 SDValue Value = ST->getValue();
24535 EVT ValueVT = Value.getValueType();
24536
24537 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
24538 Value.getOpcode() != ISD::TRUNCATE ||
24539 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
24540 return SDValue();
24541
24542 assert(ST->getOffset().isUndef() && "undef offset expected");
24543 SDLoc DL(ST);
24544 auto WideVT = EVT::getVectorVT(
24545 *DAG.getContext(),
24546 Value->getOperand(0).getValueType().getVectorElementType(), 4);
24547 SDValue UndefVector = DAG.getUNDEF(WideVT);
24548 SDValue WideTrunc = DAG.getNode(
24549 ISD::INSERT_SUBVECTOR, DL, WideVT,
24550 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
24551 SDValue Cast = DAG.getNode(
24552 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
24553 WideTrunc);
24554
24555 MachineFunction &MF = DAG.getMachineFunction();
24556 SDValue Chain = ST->getChain();
24557 MachineMemOperand *MMO = ST->getMemOperand();
24558 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
24559 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24560 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
24561 TypeSize Offset2 = TypeSize::getFixed(2);
24562 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
24563 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
24564
24565 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24566 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
24567 TypeSize Offset1 = TypeSize::getFixed(1);
24568 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
24569 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
24570
24571 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
24572 DAG.getConstant(0, DL, MVT::i64));
24573 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
24574 MF.getMachineMemOperand(MMO, 0, 1));
24575 return Chain;
24576}
24577
24578static unsigned getFPSubregForVT(EVT VT) {
24579 assert(VT.isSimple() && "Expected simple VT");
24580 switch (VT.getSimpleVT().SimpleTy) {
24581 case MVT::aarch64mfp8:
24582 return AArch64::bsub;
24583 case MVT::f16:
24584 return AArch64::hsub;
24585 case MVT::f32:
24586 return AArch64::ssub;
24587 case MVT::f64:
24588 return AArch64::dsub;
24589 default:
24590 llvm_unreachable("Unexpected VT!");
24591 }
24592}
24593
24594static SDValue performSTORECombine(SDNode *N,
24595 TargetLowering::DAGCombinerInfo &DCI,
24596 SelectionDAG &DAG,
24597 const AArch64Subtarget *Subtarget) {
24598 StoreSDNode *ST = cast<StoreSDNode>(N);
24599 SDValue Chain = ST->getChain();
24600 SDValue Value = ST->getValue();
24601 SDValue Ptr = ST->getBasePtr();
24602 EVT ValueVT = Value.getValueType();
24603 EVT MemVT = ST->getMemoryVT();
24604 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24605 SDLoc DL(ST);
24606
24607 if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
24608 return Res;
24609
24610 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
24611 EVT EltVT = VT.getVectorElementType();
24612 return EltVT == MVT::f32 || EltVT == MVT::f64;
24613 };
24614
24615 // Cast ptr32 and ptr64 pointers to the default address space before a store.
24616 unsigned AddrSpace = ST->getAddressSpace();
24617 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24618 AddrSpace == ARM64AS::PTR32_UPTR) {
24619 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24620 if (PtrVT != Ptr.getSimpleValueType()) {
24621 SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0);
24622 return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(),
24623 ST->getBaseAlign(), ST->getMemOperand()->getFlags(),
24624 ST->getAAInfo());
24625 }
24626 }
24627
24628 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
24629 return Res;
24630
24631 // If this is an FP_ROUND followed by a store, fold this into a truncating
24632 // store. We can do this even if this is already a truncstore.
24633 // We purposefully don't care about legality of the nodes here as we know
24634 // they can be split down into something legal.
24635 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
24636 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
24637 Subtarget->useSVEForFixedLengthVectors() &&
24638 ValueVT.isFixedLengthVector() &&
24639 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
24640 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
24641 return DAG.getTruncStore(Chain, DL, Value.getOperand(0), Ptr, MemVT,
24642 ST->getMemOperand());
24643
24644 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
24645 return Split;
24646
24647 if (Subtarget->supportsAddressTopByteIgnored() &&
24648 performTBISimplification(N->getOperand(2), DCI, DAG))
24649 return SDValue(N, 0);
24650
24651 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
24652 return Store;
24653
24654 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
24655 return Store;
24656
24657 if (ST->isTruncatingStore() &&
24658 isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
24659 if (SDValue Rshrnb =
24660 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
24661 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
24662 MemVT, ST->getMemOperand());
24663 }
24664 }
24665
24666 // This is an integer vector_extract_elt followed by a (possibly truncating)
24667 // store. We may be able to replace this with a store of an FP subregister.
24668 if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
24669 Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24670
24671 SDValue Vector = Value.getOperand(0);
24672 SDValue ExtIdx = Value.getOperand(1);
24673 EVT VectorVT = Vector.getValueType();
24674 EVT ElemVT = VectorVT.getVectorElementType();
24675
24676 if (!ValueVT.isInteger())
24677 return SDValue();
24678
24679 // Propagate zero constants (applying this fold may miss optimizations).
24681 SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT);
24682 DAG.ReplaceAllUsesWith(Value, ZeroElt);
24683 return SDValue();
24684 }
24685
24686 if (ValueVT != MemVT && !ST->isTruncatingStore())
24687 return SDValue();
24688
24689 // This could generate an additional extract if the index is non-zero and
24690 // the extracted value has multiple uses.
24691 auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24692 if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
24693 return SDValue();
24694
24695 // These can lower to st1, which is preferable if we're unlikely to fold the
24696 // addressing into the store.
24697 if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
24698 (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
24699 !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
24700 return SDValue();
24701
24702 if (MemVT == MVT::i64 || MemVT == MVT::i32) {
24703 // Heuristic: If there are other users of w/x integer scalars extracted
24704 // from this vector that won't fold into the store -- abandon folding.
24705 // Applying this fold may disrupt paired stores.
24706 for (const auto &Use : Vector->uses()) {
24707 if (Use.getResNo() != Vector.getResNo())
24708 continue;
24709 const SDNode *User = Use.getUser();
24710 if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24711 (!User->hasOneUse() ||
24712 (*User->user_begin())->getOpcode() != ISD::STORE))
24713 return SDValue();
24714 }
24715 }
24716
24717 SDValue ExtVector = Vector;
24718 if (!ExtCst || !ExtCst->isZero()) {
24719 // Handle extracting from lanes != 0.
24720 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24721 Value.getValueType(), Vector, ExtIdx);
24722 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
24723 ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT,
24724 DAG.getUNDEF(VectorVT), Ext, Zero);
24725 }
24726
24727 EVT FPMemVT = MemVT == MVT::i8
24728 ? MVT::aarch64mfp8
24729 : EVT::getFloatingPointVT(MemVT.getFixedSizeInBits());
24730 SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24731 FPMemVT, ExtVector);
24732
24733 return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
24734 ST->getMemOperand());
24735 }
24736
24737 return SDValue();
24738}
24739
24740static bool
24741isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
24742 if (N->getOpcode() != ISD::CONCAT_VECTORS)
24743 return false;
24744
24745 unsigned NumParts = N->getNumOperands();
24746
24747 // We should be concatenating each sequential result from a
24748 // VECTOR_INTERLEAVE.
24749 SDNode *InterleaveOp = N->getOperand(0).getNode();
24750 if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
24751 InterleaveOp->getNumOperands() != NumParts)
24752 return false;
24753
24754 for (unsigned I = 0; I < NumParts; I++)
24755 if (N->getOperand(I) != SDValue(InterleaveOp, I))
24756 return false;
24757
24758 Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end());
24759 return true;
24760}
24761
24762static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL,
24763 SDValue WideMask,
24764 unsigned RequiredNumParts) {
24765 if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
24766 SmallVector<SDValue, 4> MaskInterleaveOps;
24767 if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
24768 MaskInterleaveOps))
24769 return SDValue();
24770
24771 if (MaskInterleaveOps.size() != RequiredNumParts)
24772 return SDValue();
24773
24774 // Make sure the inputs to the vector interleave are identical.
24775 if (!llvm::all_equal(MaskInterleaveOps))
24776 return SDValue();
24777
24778 return MaskInterleaveOps[0];
24779 }
24780
24781 if (WideMask->getOpcode() != ISD::SPLAT_VECTOR)
24782 return SDValue();
24783
24784 ElementCount EC = WideMask.getValueType().getVectorElementCount();
24785 assert(EC.isKnownMultipleOf(RequiredNumParts) &&
24786 "Expected element count divisible by number of parts");
24787 EC = EC.divideCoefficientBy(RequiredNumParts);
24788 return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
24789 WideMask->getOperand(0));
24790}
24791
24792static SDValue performInterleavedMaskedStoreCombine(
24793 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24794 if (!DCI.isBeforeLegalize())
24795 return SDValue();
24796
24797 auto *MST = cast<MaskedStoreSDNode>(N);
24798 SDValue WideValue = MST->getValue();
24799
24800 // Bail out if the stored value has an unexpected number of uses, since we'll
24801 // have to perform manual interleaving and may as well just use normal masked
24802 // stores. Also, discard masked stores that are truncating or indexed.
24803 if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
24804 !MST->isSimple() || !MST->getOffset().isUndef())
24805 return SDValue();
24806
24807 SmallVector<SDValue, 4> ValueInterleaveOps;
24808 if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
24809 ValueInterleaveOps))
24810 return SDValue();
24811
24812 unsigned NumParts = ValueInterleaveOps.size();
24813 if (NumParts != 2 && NumParts != 4)
24814 return SDValue();
24815
24816 // At the moment we're unlikely to see a fixed-width vector interleave as
24817 // we usually generate shuffles instead.
24818 EVT SubVecTy = ValueInterleaveOps[0].getValueType();
24819 if (!SubVecTy.isScalableVT() ||
24820 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
24821 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
24822 return SDValue();
24823
24824 SDLoc DL(N);
24825 SDValue NarrowMask =
24826 getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
24827 if (!NarrowMask)
24828 return SDValue();
24829
24830 const Intrinsic::ID IID =
24831 NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
24832 SmallVector<SDValue, 8> NewStOps;
24833 NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
24834 NewStOps.append(ValueInterleaveOps);
24835 NewStOps.append({NarrowMask, MST->getBasePtr()});
24836 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
24837}
24838
24839static SDValue performMSTORECombine(SDNode *N,
24840 TargetLowering::DAGCombinerInfo &DCI,
24841 SelectionDAG &DAG,
24842 const AArch64Subtarget *Subtarget) {
24843 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
24844 SDValue Value = MST->getValue();
24845 SDValue Mask = MST->getMask();
24846 SDLoc DL(N);
24847
24848 if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG))
24849 return Res;
24850
24851 // If this is a UZP1 followed by a masked store, fold this into a masked
24852 // truncating store. We can do this even if this is already a masked
24853 // truncstore.
24854 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
24855 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
24856 Value.getValueType().isInteger()) {
24857 Value = Value.getOperand(0);
24858 if (Value.getOpcode() == ISD::BITCAST) {
24859 EVT HalfVT =
24860 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
24861 EVT InVT = Value.getOperand(0).getValueType();
24862
24863 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
24864 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
24865 unsigned PgPattern = Mask->getConstantOperandVal(0);
24866
24867 // Ensure we can double the size of the predicate pattern
24868 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
24869 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
24870 MinSVESize) {
24871 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
24872 PgPattern);
24873 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
24874 MST->getBasePtr(), MST->getOffset(), Mask,
24875 MST->getMemoryVT(), MST->getMemOperand(),
24876 MST->getAddressingMode(),
24877 /*IsTruncating=*/true);
24878 }
24879 }
24880 }
24881 }
24882
24883 if (MST->isTruncatingStore()) {
24884 EVT ValueVT = Value->getValueType(0);
24885 EVT MemVT = MST->getMemoryVT();
24886 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
24887 return SDValue();
24888 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
24889 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
24890 MST->getOffset(), MST->getMask(),
24891 MST->getMemoryVT(), MST->getMemOperand(),
24892 MST->getAddressingMode(), true);
24893 }
24894 }
24895
24896 return SDValue();
24897}
24898
24899/// \return true if part of the index was folded into the Base.
24900static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
24901 SDLoc DL, SelectionDAG &DAG) {
24902 // This function assumes a vector of i64 indices.
24903 EVT IndexVT = Index.getValueType();
24904 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
24905 return false;
24906
24907 // Simplify:
24908 // BasePtr = Ptr
24909 // Index = X + splat(Offset)
24910 // ->
24911 // BasePtr = Ptr + Offset * scale.
24912 // Index = X
24913 if (Index.getOpcode() == ISD::ADD) {
24914 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
24915 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24916 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24917 Index = Index.getOperand(0);
24918 return true;
24919 }
24920 }
24921
24922 // Simplify:
24923 // BasePtr = Ptr
24924 // Index = (X + splat(Offset)) << splat(Shift)
24925 // ->
24926 // BasePtr = Ptr + (Offset << Shift) * scale
24927 // Index = X << splat(shift)
24928 if (Index.getOpcode() == ISD::SHL &&
24929 Index.getOperand(0).getOpcode() == ISD::ADD) {
24930 SDValue Add = Index.getOperand(0);
24931 SDValue ShiftOp = Index.getOperand(1);
24932 SDValue OffsetOp = Add.getOperand(1);
24933 if (auto Shift = DAG.getSplatValue(ShiftOp))
24934 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
24935 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
24936 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
24937 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
24938 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
24939 Add.getOperand(0), ShiftOp);
24940 return true;
24941 }
24942 }
24943
24944 return false;
24945}
24946
24947// Analyse the specified address returning true if a more optimal addressing
24948// mode is available. When returning true all parameters are updated to reflect
24949// their recommended values.
24950static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
24951 SDValue &BasePtr, SDValue &Index,
24952 SelectionDAG &DAG) {
24953 // Try to iteratively fold parts of the index into the base pointer to
24954 // simplify the index as much as possible.
24955 bool Changed = false;
24956 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
24957 Changed = true;
24958
24959 // Only consider element types that are pointer sized as smaller types can
24960 // be easily promoted.
24961 EVT IndexVT = Index.getValueType();
24962 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
24963 return Changed;
24964
24965 // Can indices be trivially shrunk?
24966 EVT DataVT = N->getOperand(1).getValueType();
24967 // Don't attempt to shrink the index for fixed vectors of 64-bit data since it
24968 // will later be re-extended to 64 bits in legalization.
24969 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
24970 return Changed;
24971 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
24972 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
24973 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
24974 return true;
24975 }
24976
24977 // Match:
24978 // Index = step(const)
24979 int64_t Stride = 0;
24980 if (Index.getOpcode() == ISD::STEP_VECTOR) {
24981 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
24982 }
24983 // Match:
24984 // Index = step(const) << shift(const)
24985 else if (Index.getOpcode() == ISD::SHL &&
24986 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
24987 SDValue RHS = Index.getOperand(1);
24988 if (auto *Shift =
24989 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
24990 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
24991 Stride = Step << Shift->getZExtValue();
24992 }
24993 }
24994
24995 // Return early because no supported pattern is found.
24996 if (Stride == 0)
24997 return Changed;
24998
24999 if (Stride < std::numeric_limits<int32_t>::min() ||
25000 Stride > std::numeric_limits<int32_t>::max())
25001 return Changed;
25002
25003 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
25004 unsigned MaxVScale =
25005 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
25006 int64_t LastElementOffset =
25007 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
25008
25009 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
25010 LastElementOffset > std::numeric_limits<int32_t>::max())
25011 return Changed;
25012
25013 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
25014 // Stride does not scale explicitly by 'Scale', because it happens in
25015 // the gather/scatter addressing mode.
25016 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
25017 return true;
25018}
25019
25020static SDValue performMaskedGatherScatterCombine(
25021 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
25022 if (!DCI.isBeforeLegalize())
25023 return SDValue();
25024 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
25025
25026 SDLoc DL(MGS);
25027 SDValue Chain = MGS->getChain();
25028 SDValue Scale = MGS->getScale();
25029 SDValue Index = MGS->getIndex();
25030 SDValue Mask = MGS->getMask();
25031 SDValue BasePtr = MGS->getBasePtr();
25032 ISD::MemIndexType IndexType = MGS->getIndexType();
25033
25034 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
25035 return SDValue();
25036
25037 // Here we catch such cases early and change MGATHER's IndexType to allow
25038 // the use of an Index that's more legalisation friendly.
25039 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
25040 SDValue PassThru = MGT->getPassThru();
25041 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
25042 return DAG.getMaskedGather(
25043 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
25044 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
25045 }
25046 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
25047 SDValue Data = MSC->getValue();
25048 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
25049 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
25050 DL, Ops, MSC->getMemOperand(), IndexType,
25051 MSC->isTruncatingStore());
25052 }
25053 auto *HG = cast<MaskedHistogramSDNode>(MGS);
25054 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
25055 Index, Scale, HG->getIntID()};
25056 return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
25057 DL, Ops, HG->getMemOperand(), IndexType);
25058}
25059
25060/// Target-specific DAG combine function for NEON load/store intrinsics
25061/// to merge base address updates.
25062static SDValue performNEONPostLDSTCombine(SDNode *N,
25063 TargetLowering::DAGCombinerInfo &DCI,
25064 SelectionDAG &DAG) {
25065 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
25066 return SDValue();
25067
25068 unsigned AddrOpIdx = N->getNumOperands() - 1;
25069 SDValue Addr = N->getOperand(AddrOpIdx);
25070
25071 // Search for a use of the address operand that is an increment.
25072 for (SDUse &Use : Addr->uses()) {
25073 SDNode *User = Use.getUser();
25074 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
25075 continue;
25076
25077 // Check that the add is independent of the load/store. Otherwise, folding
25078 // it would create a cycle.
25079 SmallPtrSet<const SDNode *, 32> Visited;
25080 SmallVector<const SDNode *, 16> Worklist;
25081 Visited.insert(Addr.getNode());
25082 Worklist.push_back(N);
25083 Worklist.push_back(User);
25084 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
25085 SDNode::hasPredecessorHelper(User, Visited, Worklist))
25086 continue;
25087
25088 // Find the new opcode for the updating load/store.
25089 bool IsStore = false;
25090 bool IsLaneOp = false;
25091 bool IsDupOp = false;
25092 unsigned NewOpc = 0;
25093 unsigned NumVecs = 0;
25094 unsigned IntNo = N->getConstantOperandVal(1);
25095 switch (IntNo) {
25096 default: llvm_unreachable("unexpected intrinsic for Neon base update");
25097 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
25098 NumVecs = 2; break;
25099 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
25100 NumVecs = 3; break;
25101 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
25102 NumVecs = 4; break;
25103 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
25104 NumVecs = 2; IsStore = true; break;
25105 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
25106 NumVecs = 3; IsStore = true; break;
25107 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
25108 NumVecs = 4; IsStore = true; break;
25109 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
25110 NumVecs = 2; break;
25111 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
25112 NumVecs = 3; break;
25113 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
25114 NumVecs = 4; break;
25115 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
25116 NumVecs = 2; IsStore = true; break;
25117 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
25118 NumVecs = 3; IsStore = true; break;
25119 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
25120 NumVecs = 4; IsStore = true; break;
25121 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
25122 NumVecs = 2; IsDupOp = true; break;
25123 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
25124 NumVecs = 3; IsDupOp = true; break;
25125 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
25126 NumVecs = 4; IsDupOp = true; break;
25127 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
25128 NumVecs = 2; IsLaneOp = true; break;
25129 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
25130 NumVecs = 3; IsLaneOp = true; break;
25131 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
25132 NumVecs = 4; IsLaneOp = true; break;
25133 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
25134 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
25135 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
25136 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
25137 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
25138 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
25139 }
25140
25141 EVT VecTy;
25142 if (IsStore)
25143 VecTy = N->getOperand(2).getValueType();
25144 else
25145 VecTy = N->getValueType(0);
25146
25147 // If the increment is a constant, it must match the memory ref size.
25148 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
25149 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
25150 uint32_t IncVal = CInc->getZExtValue();
25151 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
25152 if (IsLaneOp || IsDupOp)
25153 NumBytes /= VecTy.getVectorNumElements();
25154 if (IncVal != NumBytes)
25155 continue;
25156 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
25157 }
25158 SmallVector<SDValue, 8> Ops;
25159 Ops.push_back(N->getOperand(0)); // Incoming chain
25160 // Load lane and store have vector list as input.
25161 if (IsLaneOp || IsStore)
25162 for (unsigned i = 2; i < AddrOpIdx; ++i)
25163 Ops.push_back(N->getOperand(i));
25164 Ops.push_back(Addr); // Base register
25165 Ops.push_back(Inc);
25166
25167 // Return Types.
25168 EVT Tys[6];
25169 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
25170 unsigned n;
25171 for (n = 0; n < NumResultVecs; ++n)
25172 Tys[n] = VecTy;
25173 Tys[n++] = MVT::i64; // Type of write back register
25174 Tys[n] = MVT::Other; // Type of the chain
25175 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
25176
25177 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
25178 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
25179 MemInt->getMemoryVT(),
25180 MemInt->getMemOperand());
25181
25182 // Update the uses.
25183 std::vector<SDValue> NewResults;
25184 for (unsigned i = 0; i < NumResultVecs; ++i) {
25185 NewResults.push_back(SDValue(UpdN.getNode(), i));
25186 }
25187 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
25188 DCI.CombineTo(N, NewResults);
25189 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
25190
25191 break;
25192 }
25193 return SDValue();
25194}
25195
25196// Checks to see if the value is the prescribed width and returns information
25197// about its extension mode.
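// For example, an i8 or i16 extending load of the prescribed width qualifies
// and reports the load's extension type, while a constant qualifies when its
// absolute value fits in (width - 1) bits.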
25198static
25199bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
25200 ExtType = ISD::NON_EXTLOAD;
25201 switch(V.getNode()->getOpcode()) {
25202 default:
25203 return false;
25204 case ISD::LOAD: {
25205 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
25206 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
25207 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
25208 ExtType = LoadNode->getExtensionType();
25209 return true;
25210 }
25211 return false;
25212 }
25213 case ISD::AssertSext: {
25214 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25215 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25216 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25217 ExtType = ISD::SEXTLOAD;
25218 return true;
25219 }
25220 return false;
25221 }
25222 case ISD::AssertZext: {
25223 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25224 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25225 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25226 ExtType = ISD::ZEXTLOAD;
25227 return true;
25228 }
25229 return false;
25230 }
25231 case ISD::Constant:
25232 case ISD::TargetConstant: {
25233 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
25234 1LL << (width - 1);
25235 }
25236 }
25237
25238 return true;
25239}
25240
25241// This function does a whole lot of voodoo to determine if the tests are
25242// equivalent without and with a mask. Essentially what happens is that given a
25243// DAG resembling:
25244//
25245// +-------------+ +-------------+ +-------------+ +-------------+
25246// | Input | | AddConstant | | CompConstant| | CC |
25247// +-------------+ +-------------+ +-------------+ +-------------+
25248// | | | |
25249// V V | +----------+
25250// +-------------+ +----+ | |
25251// | ADD | |0xff| | |
25252// +-------------+ +----+ | |
25253// | | | |
25254// V V | |
25255// +-------------+ | |
25256// | AND | | |
25257// +-------------+ | |
25258// | | |
25259// +-----+ | |
25260// | | |
25261// V V V
25262// +-------------+
25263// | CMP |
25264// +-------------+
25265//
25266// The AND node may be safely removed for some combinations of inputs. In
25267// particular we need to take into account the extension type of the Input,
25268// the exact values of AddConstant, CompConstant, and CC, along with the nominal
25269// width of the input (this can work for any width inputs, the above graph is
25270// specific to 8 bits).
25271//
25272// The specific equations were worked out by generating output tables for each
25273// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
25274// problem was simplified by working with 4 bit inputs, which means we only
25275// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
25276// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
25277// patterns present in both extensions (0,7). For every distinct set of
25278// AddConstant and CompConstants bit patterns we can consider the masked and
25279// unmasked versions to be equivalent if the result of this function is true for
25280// all 16 distinct bit patterns for the current extension type of Input (w0).
25281//
25282// sub w8, w0, w1
25283// and w10, w8, #0x0f
25284// cmp w8, w2
25285// cset w9, AArch64CC
25286// cmp w10, w2
25287// cset w11, AArch64CC
25288// cmp w9, w11
25289// cset w0, eq
25290// ret
25291//
25292// Since the above function shows when the outputs are equivalent it defines
25293// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
25294// would be expensive to run during compiles. The equations below were written
25295// in a test harness that confirmed they gave outputs equivalent to the above
25296// function for all inputs, so they can be used to determine whether the
25297// removal is legal instead.
25298//
25299// isEquivalentMaskless() is the code for testing if the AND can be removed,
25300// factored out of the DAG recognition since the DAG can take several forms.
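//
// For example, with width == 8 (so MaxUInt == 256), a zero-extended input,
// AddConstant == 3 and CompConstant == 5, the EQ/NE case below holds
// (AddConstant >= 0 && CompConstant >= AddConstant): ((x + 3) & 0xff) and
// (x + 3) compare (not) equal to 5 for exactly the same 8-bit values of x,
// so the AND can be dropped.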
25301
25302static bool isEquivalentMaskless(unsigned CC, unsigned width,
25303 ISD::LoadExtType ExtType, int AddConstant,
25304 int CompConstant) {
25305 // By being careful about our equations and only writing them in terms of
25306 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
25307 // make them generally applicable to all bit widths.
25308 int MaxUInt = (1 << width);
25309
25310 // For the purposes of these comparisons sign extending the type is
25311 // equivalent to zero extending the add and displacing it by half the integer
25312 // width. Provided we are careful and make sure our equations are valid over
25313 // the whole range we can just adjust the input and avoid writing equations
25314 // for sign extended inputs.
25315 if (ExtType == ISD::SEXTLOAD)
25316 AddConstant -= (1 << (width-1));
25317
25318 switch(CC) {
25319 case AArch64CC::LE:
25320 case AArch64CC::GT:
25321 if ((AddConstant == 0) ||
25322 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
25323 (AddConstant >= 0 && CompConstant < 0) ||
25324 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
25325 return true;
25326 break;
25327 case AArch64CC::LT:
25328 case AArch64CC::GE:
25329 if ((AddConstant == 0) ||
25330 (AddConstant >= 0 && CompConstant <= 0) ||
25331 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
25332 return true;
25333 break;
25334 case AArch64CC::HI:
25335 case AArch64CC::LS:
25336 if ((AddConstant >= 0 && CompConstant < 0) ||
25337 (AddConstant <= 0 && CompConstant >= -1 &&
25338 CompConstant < AddConstant + MaxUInt))
25339 return true;
25340 break;
25341 case AArch64CC::PL:
25342 case AArch64CC::MI:
25343 if ((AddConstant == 0) ||
25344 (AddConstant > 0 && CompConstant <= 0) ||
25345 (AddConstant < 0 && CompConstant <= AddConstant))
25346 return true;
25347 break;
25348 case AArch64CC::LO:
25349 case AArch64CC::HS:
25350 if ((AddConstant >= 0 && CompConstant <= 0) ||
25351 (AddConstant <= 0 && CompConstant >= 0 &&
25352 CompConstant <= AddConstant + MaxUInt))
25353 return true;
25354 break;
25355 case AArch64CC::EQ:
25356 case AArch64CC::NE:
25357 if ((AddConstant > 0 && CompConstant < 0) ||
25358 (AddConstant < 0 && CompConstant >= 0 &&
25359 CompConstant < AddConstant + MaxUInt) ||
25360 (AddConstant >= 0 && CompConstant >= 0 &&
25361 CompConstant >= AddConstant) ||
25362 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
25363 return true;
25364 break;
25365 case AArch64CC::VS:
25366 case AArch64CC::VC:
25367 case AArch64CC::AL:
25368 case AArch64CC::NV:
25369 return true;
25370 case AArch64CC::Invalid:
25371 break;
25372 }
25373
25374 return false;
25375}
25376
25377// (X & C) >u Mask --> (X & (C & ~Mask)) != 0
25378// (X & C) <u Pow2 --> (X & (C & ~(Pow2-1))) == 0
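// For example, (X & 0b1100) >u 0b0011 is equivalent to (X & 0b1100) != 0,
// since the masked value can only be 0, 4, 8 or 12.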
25379static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
25380 SDNode *AndNode, SelectionDAG &DAG,
25381 unsigned CCIndex, unsigned CmpIndex,
25382 unsigned CC) {
25383 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
25384 if (!SubsC)
25385 return SDValue();
25386
25387 APInt SubsAP = SubsC->getAPIntValue();
25388 if (CC == AArch64CC::HI) {
25389 if (!SubsAP.isMask())
25390 return SDValue();
25391 } else if (CC == AArch64CC::LO) {
25392 if (!SubsAP.isPowerOf2())
25393 return SDValue();
25394 } else
25395 return SDValue();
25396
25397 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
25398 if (!AndC)
25399 return SDValue();
25400
25401 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
25402
25403 SDLoc DL(N);
25404 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
25405 SDValue ANDS = DAG.getNode(
25406 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
25407 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
25408 SDValue AArch64_CC =
25409 DAG.getConstant(CC, DL,
25410 N->getOperand(CCIndex)->getValueType(0));
25411
25412 // For now, only performCSELCombine and performBRCONDCombine call this
25413 // function. And both of them pass 2 for CCIndex, 3 for CmpIndex with 4
25414 // operands. So just init the ops directly to simplify the code. If we have
25415 // some other case with a different CCIndex or CmpIndex, we would need a for
25416 // loop to rewrite the code here.
25417 // TODO: Do we need to assert that the number of operands is 4 here?
25418 assert((CCIndex == 2 && CmpIndex == 3) &&
25419 "Expected CCIndex to be 2 and CmpIndex to be 3.");
25420 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
25421 ANDS.getValue(1)};
25422 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
25423}
25424
25425static
25426SDValue performCONDCombine(SDNode *N,
25427 TargetLowering::DAGCombinerInfo &DCI,
25428 SelectionDAG &DAG, unsigned CCIndex,
25429 unsigned CmpIndex) {
25430 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
25431 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
25432 unsigned CondOpcode = SubsNode->getOpcode();
25433
25434 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
25435 !SubsNode->hasOneUse())
25436 return SDValue();
25437
25438 // There is a SUBS feeding this condition. Is it fed by a mask we can
25439 // use?
25440
25441 SDNode *AndNode = SubsNode->getOperand(0).getNode();
25442 unsigned MaskBits = 0;
25443
25444 if (AndNode->getOpcode() != ISD::AND)
25445 return SDValue();
25446
25447 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
25448 CmpIndex, CC))
25449 return Val;
25450
25451 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
25452 uint32_t CNV = CN->getZExtValue();
25453 if (CNV == 255)
25454 MaskBits = 8;
25455 else if (CNV == 65535)
25456 MaskBits = 16;
25457 }
25458
25459 if (!MaskBits)
25460 return SDValue();
25461
25462 SDValue AddValue = AndNode->getOperand(0);
25463
25464 if (AddValue.getOpcode() != ISD::ADD)
25465 return SDValue();
25466
25467 // The basic dag structure is correct, grab the inputs and validate them.
25468
25469 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
25470 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
25471 SDValue SubsInputValue = SubsNode->getOperand(1);
25472
25473 // The mask is present and the provenance of all the values is a smaller type,
25474 // let's see if the mask is superfluous.
25475
25476 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
25477 !isa<ConstantSDNode>(SubsInputValue.getNode()))
25478 return SDValue();
25479
25480 ISD::LoadExtType ExtType;
25481
25482 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
25483 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
25484 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
25485 return SDValue();
25486
25487 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
25488 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
25489 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
25490 return SDValue();
25491
25492 // The AND is not necessary, remove it.
25493
25494 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
25495 SubsNode->getValueType(1));
25496 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
25497
25498 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
25499 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
25500
25501 return SDValue(N, 0);
25502}
25503
25504// Optimize compare with zero and branch.
25505static SDValue performBRCONDCombine(SDNode *N,
25506 TargetLowering::DAGCombinerInfo &DCI,
25507 SelectionDAG &DAG) {
25508 MachineFunction &MF = DAG.getMachineFunction();
25509 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
25510 // will not be produced, as they are conditional branch instructions that do
25511 // not set flags.
25512 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
25513 return SDValue();
25514
25515 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
25516 N = NV.getNode();
25517 SDValue Chain = N->getOperand(0);
25518 SDValue Dest = N->getOperand(1);
25519 SDValue CCVal = N->getOperand(2);
25520 SDValue Cmp = N->getOperand(3);
25521
25522 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
25523 unsigned CC = CCVal->getAsZExtVal();
25524 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
25525 return SDValue();
25526
25527 // Fold away brcond(NE, cmp(csel(1, 0, CC, Cmp), 1)) -> brcond(~CC, Cmp)
25528 if (isCMP(Cmp) && CC == AArch64CC::NE && isOneConstant(Cmp.getOperand(1))) {
25529 SDValue CSel = Cmp.getOperand(0);
25530 auto CSelCC = getCSETCondCode(CSel);
25531 if (CSelCC) {
25532 SDLoc DL(N);
25533 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest,
25534 getCondCode(DAG, getInvertedCondCode(*CSelCC)),
25535 CSel.getOperand(3));
25536 }
25537 }
25538
25539 unsigned CmpOpc = Cmp.getOpcode();
25540 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
25541 return SDValue();
25542
25543 // Only attempt folding if there is only one use of the flag and no use of the
25544 // value.
25545 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
25546 return SDValue();
25547
25548 SDValue LHS = Cmp.getOperand(0);
25549 SDValue RHS = Cmp.getOperand(1);
25550
25551 assert(LHS.getValueType() == RHS.getValueType() &&
25552 "Expected the value type to be the same for both operands!");
25553 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
25554 return SDValue();
25555
25556 if (isNullConstant(LHS))
25557 std::swap(LHS, RHS);
25558
25559 if (!isNullConstant(RHS))
25560 return SDValue();
25561
25562 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
25563 LHS.getOpcode() == ISD::SRL)
25564 return SDValue();
25565
25566 // Fold the compare into the branch instruction.
25567 SDValue BR;
25568 if (CC == AArch64CC::EQ)
25569 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25570 else
25571 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
25572
25573 // Do not add new nodes to DAG combiner worklist.
25574 DCI.CombineTo(N, BR, false);
25575
25576 return SDValue();
25577}
25578
25579static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
25580 unsigned CC = N->getConstantOperandVal(2);
25581 SDValue SUBS = N->getOperand(3);
25582 SDValue Zero, CTTZ;
25583
25584 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
25585 Zero = N->getOperand(0);
25586 CTTZ = N->getOperand(1);
25587 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
25588 Zero = N->getOperand(1);
25589 CTTZ = N->getOperand(0);
25590 } else
25591 return SDValue();
25592
25593 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
25594 (CTTZ.getOpcode() == ISD::TRUNCATE &&
25595 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
25596 return SDValue();
25597
25598 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
25599 "Illegal type in CTTZ folding");
25600
25601 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
25602 return SDValue();
25603
25604 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
25605 ? CTTZ.getOperand(0).getOperand(0)
25606 : CTTZ.getOperand(0);
25607
25608 if (X != SUBS.getOperand(0))
25609 return SDValue();
25610
25611 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
25612 ? CTTZ.getOperand(0).getValueSizeInBits()
25613 : CTTZ.getValueSizeInBits();
25614 SDValue BitWidthMinusOne =
25615 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
25616 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
25617 BitWidthMinusOne);
25618}
25619
25620// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
25621// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
25622// Where x and y are constants and x != y
25623
25624// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
25625// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
25626// Where x and y are constants and x != y
25627static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
25628 SDValue L = Op->getOperand(0);
25629 SDValue R = Op->getOperand(1);
25630 AArch64CC::CondCode OpCC =
25631 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25632
25633 SDValue OpCmp = Op->getOperand(3);
25634 if (!isCMP(OpCmp))
25635 return SDValue();
25636
25637 SDValue CmpLHS = OpCmp.getOperand(0);
25638 SDValue CmpRHS = OpCmp.getOperand(1);
25639
25640 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
25641 std::swap(CmpLHS, CmpRHS);
25642 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
25643 return SDValue();
25644
25645 SDValue X = CmpLHS->getOperand(0);
25646 SDValue Y = CmpLHS->getOperand(1);
25647 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
25648 return SDValue();
25649 }
25650
25651 // If one of the constants is an opaque constant, the x and y SDNodes may
25652 // still differ even though the real values are the same. So check the APInt
25653 // values here to make sure the code is correct.
25654 auto *CX = cast<ConstantSDNode>(X);
25655 auto *CY = cast<ConstantSDNode>(Y);
25656 if (CX->getAPIntValue() == CY->getAPIntValue())
25657 return SDValue();
25658
25659 AArch64CC::CondCode CC =
25660 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
25661 SDValue Cond = CmpLHS->getOperand(3);
25662
25663 if (CmpRHS == Y)
25664 CC = AArch64CC::getInvertedCondCode(CC);
25665 else if (CmpRHS != X)
25666 return SDValue();
25667
25668 if (OpCC == AArch64CC::NE)
25669 CC = AArch64CC::getInvertedCondCode(CC);
25670 else if (OpCC != AArch64CC::EQ)
25671 return SDValue();
25672
25673 SDLoc DL(Op);
25674 EVT VT = Op->getValueType(0);
25675
25676 SDValue CCValue = getCondCode(DAG, CC);
25677 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
25678}
25679
25680// Reassociate the true/false expressions of a CSEL instruction to obtain a
25681// common subexpression with the comparison instruction. For example, change
25682// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
25683// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
25684// subexpression.
25685static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
25686 SDValue SubsNode = N->getOperand(3);
25687 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
25688 return SDValue();
25689
25690 SDValue CmpOpToMatch = SubsNode.getOperand(1);
25691 SDValue CmpOpOther = SubsNode.getOperand(0);
25692 EVT VT = N->getValueType(0);
25693
25694 unsigned ExpectedOpcode;
25695 SDValue ExpectedOp;
25696 SDValue SubsOp;
25697 auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
25698 if (CmpOpConst) {
25699 ExpectedOpcode = ISD::ADD;
25700 ExpectedOp =
25701 DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25702 CmpOpConst->getValueType(0));
25703 SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
25704 CmpOpConst->getValueType(0));
25705 } else {
25706 ExpectedOpcode = ISD::SUB;
25707 ExpectedOp = CmpOpToMatch;
25708 SubsOp = CmpOpToMatch;
25709 }
25710
25711 // Get the operand that can be reassociated with the SUBS instruction.
25712 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
25713 if (Op.getOpcode() != ExpectedOpcode)
25714 return SDValue();
25715 if (Op.getOperand(0).getOpcode() != ISD::ADD ||
25716 !Op.getOperand(0).hasOneUse())
25717 return SDValue();
25718 SDValue X = Op.getOperand(0).getOperand(0);
25719 SDValue Y = Op.getOperand(0).getOperand(1);
25720 if (X != CmpOpOther)
25721 std::swap(X, Y);
25722 if (X != CmpOpOther)
25723 return SDValue();
25724 if (ExpectedOp != Op.getOperand(1))
25725 return SDValue();
25726 return Y;
25727 };
25728
25729 // Try the reassociation using the given constant and condition code.
25730 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
25731 SDValue SubsOp) {
25732 SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
25733 SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
25734 if (!TReassocOp && !FReassocOp)
25735 return SDValue();
25736
25737 SDValue NewCmp =
25738 DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
25739 DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
25740
25741 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
25742 if (!ReassocOp)
25743 return N->getOperand(OpNum);
25744 SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
25745 NewCmp.getValue(0), ReassocOp);
25746 DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
25747 return Res;
25748 };
25749
25750 SDValue TValReassoc = Reassociate(TReassocOp, 0);
25751 SDValue FValReassoc = Reassociate(FReassocOp, 1);
25752 return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
25753 getCondCode(DAG, NewCC), NewCmp.getValue(1));
25754 };
25755
25756 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25757
25758 // First, try to eliminate the compare instruction by searching for a
25759 // subtraction with the same constant.
25760 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
25761 return R;
25762
25763 if (!CmpOpConst) {
25764 // Try again with the operands of the SUBS instruction and the condition
25765 // swapped. Due to canonicalization, this only helps for non-constant
25766 // operands of the SUBS instruction.
25767 std::swap(CmpOpToMatch, CmpOpOther);
25768 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
25769 return R;
25770 return SDValue();
25771 }
25772
25773 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
25774 return SDValue();
25775
25776 // Next, search for a subtraction with a slightly different constant. By
25777 // adjusting the condition code, we can still eliminate the compare
25778 // instruction. Adjusting the constant is only valid if it does not result
25779 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
25780 // Since such comparisons are trivially true/false, we should not encounter
25781 // them here but check for them nevertheless to be on the safe side.
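 // For example, an unsigned x <u C (LO with constant C) can be rewritten as
 // x <=u C-1 (LS with constant C-1), which is only valid when C is not zero.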
25782 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
25783 AArch64CC::CondCode NewCC) {
25784 auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
25785 CmpOpConst->getValueType(0));
25786 auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
25787 CmpOpConst->getValueType(0));
25788 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
25789 };
25790 switch (CC) {
25791 case AArch64CC::EQ:
25792 case AArch64CC::LS:
25793 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25794 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
25795 case AArch64CC::NE:
25796 case AArch64CC::HI:
25797 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25798 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
25799 case AArch64CC::LO:
25800 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25801 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
25802 case AArch64CC::HS:
25803 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25804 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
25805 case AArch64CC::LT:
25806 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25807 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
25808 case AArch64CC::LE:
25809 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25810 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
25811 case AArch64CC::GT:
25812 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25813 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
25814 case AArch64CC::GE:
25815 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25816 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
25817 default:
25818 return SDValue();
25819 }
25820}
25821
25822static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
25823 AArch64CC::CondCode OpCC =
25824 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
25825
25826 if (OpCC != AArch64CC::NE)
25827 return SDValue();
25828
25829 SDValue PTest = Op->getOperand(3);
25830 if (PTest.getOpcode() != AArch64ISD::PTEST_ANY)
25831 return SDValue();
25832
25833 SDValue TruePred = PTest.getOperand(0);
25834 SDValue AnyPred = PTest.getOperand(1);
25835
25836 if (TruePred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25837 TruePred = TruePred.getOperand(0);
25838
25839 if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25840 AnyPred = AnyPred.getOperand(0);
25841
25842 if (TruePred != AnyPred && !isAllActivePredicate(DAG, TruePred))
25843 return SDValue();
25844
25845 SDValue LastB = Op->getOperand(0);
25846 SDValue Default = Op->getOperand(1);
25847
25848 if (LastB.getOpcode() != AArch64ISD::LASTB || LastB.getOperand(0) != AnyPred)
25849 return SDValue();
25850
25851 return DAG.getNode(AArch64ISD::CLASTB_N, SDLoc(Op), Op->getValueType(0),
25852 AnyPred, Default, LastB.getOperand(1));
25853}
25854
25855// Optimize CSEL instructions
25856static SDValue performCSELCombine(SDNode *N,
25857 TargetLowering::DAGCombinerInfo &DCI,
25858 SelectionDAG &DAG) {
25859 // CSEL x, x, cc -> x
25860 if (N->getOperand(0) == N->getOperand(1))
25861 return N->getOperand(0);
25862
25863 if (SDValue R = foldCSELOfCSEL(N, DAG))
25864 return R;
25865
25866 // Try to reassociate the true/false expressions so that we can do CSE with
25867 // a SUBS instruction used to perform the comparison.
25868 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
25869 return R;
25870
25871 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
25872 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
25873 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
25874 return Folded;
25875
25876 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
25877 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
25878 SDValue Cond = N->getOperand(3);
25879 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
25880 Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
25881 DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25882 {Cond.getOperand(1), Cond.getOperand(0)}) &&
25883 !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
25884 {Cond.getOperand(0), Cond.getOperand(1)}) &&
25885 !isNullConstant(Cond.getOperand(1))) {
25886 AArch64CC::CondCode OldCond =
25887 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25888 AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
25889 if (NewCond != AArch64CC::AL) {
25890 SDLoc DL(N);
25891 SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
25892 Cond.getOperand(1), Cond.getOperand(0));
25893 return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
25894 N->getOperand(1), getCondCode(DAG, NewCond),
25895 Sub.getValue(1));
25896 }
25897 }
25898
25899 // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
25900 // use overflow flags, to avoid the comparison with zero. In case of success,
25901 // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
25902 // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
25903 // nodes with their SUBS equivalent as is already done for other flag-setting
25904 // operators, in which case doing the replacement here becomes redundant.
25905 if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
25906 isNullConstant(Cond.getOperand(1))) {
25907 SDValue Sub = Cond.getOperand(0);
25908 AArch64CC::CondCode CC =
25909 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
25910 if (Sub.getOpcode() == ISD::SUB &&
25911 (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
25912 CC == AArch64CC::PL)) {
25913 SDLoc DL(N);
25914 SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
25915 Sub.getOperand(0), Sub.getOperand(1));
25916 DCI.CombineTo(Sub.getNode(), Subs);
25917 DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
25918 return SDValue(N, 0);
25919 }
25920 }
25921
25922 // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
25923 if (SDValue CondLast = foldCSELofLASTB(N, DAG))
25924 return CondLast;
25925
25926 return performCONDCombine(N, DCI, DAG, 2, 3);
25927}
25928
25929// Try to re-use an already extended operand of a vector SetCC feeding an
25930// extended select. Doing so avoids requiring another full extension of the
25931// SET_CC result when lowering the select.
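// For example, given a v8i8 %a for which (sext %a to v8i16) already exists and
// a (setcc %a, splat(C)) whose only uses are v8i16 vselects, it is better to
// compare the v8i16 extended operands so the compare mask is produced at the
// v8i16 element width and needs no further extension.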
25932static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
25933 EVT Op0MVT = Op->getOperand(0).getValueType();
25934 if (!Op0MVT.isVector() || Op->use_empty())
25935 return SDValue();
25936
25937 // Make sure that all uses of Op are VSELECTs with matching result types where
25938 // the result type has a larger element type than the SetCC operand.
25939 SDNode *FirstUse = *Op->user_begin();
25940 if (FirstUse->getOpcode() != ISD::VSELECT)
25941 return SDValue();
25942 EVT UseMVT = FirstUse->getValueType(0);
25943 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
25944 return SDValue();
25945 if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
25946 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
25947 }))
25948 return SDValue();
25949
25950 APInt V;
25951 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
25952 return SDValue();
25953
25954 SDLoc DL(Op);
25955 SDValue Op0ExtV;
25956 SDValue Op1ExtV;
25957 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
25958 // Check if the first operand of the SET_CC is already extended. If it is,
25959 // split the SET_CC and re-use the extended version of the operand.
25960 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
25961 Op->getOperand(0));
25962 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
25963 Op->getOperand(0));
25964 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
25965 Op0ExtV = SDValue(Op0SExt, 0);
25966 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
25967 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
25968 Op0ExtV = SDValue(Op0ZExt, 0);
25969 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
25970 } else
25971 return SDValue();
25972
25973 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
25974 Op0ExtV, Op1ExtV, Op->getOperand(2));
25975}
25976
25977static SDValue
25978performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
25979 SelectionDAG &DAG) {
25980 SDValue Vec = N->getOperand(0);
25981 if (DCI.isBeforeLegalize() &&
25982 Vec.getValueType().getVectorElementType() == MVT::i1 &&
25983 Vec.getValueType().isFixedLengthVector() &&
25984 Vec.getValueType().isPow2VectorType()) {
25985 SDLoc DL(N);
25986 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
25987 DAG);
25988 }
25989
25990 return SDValue();
25991}
25992
25993static SDValue performSETCCCombine(SDNode *N,
25994 TargetLowering::DAGCombinerInfo &DCI,
25995 SelectionDAG &DAG) {
25996 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
25997 SDValue LHS = N->getOperand(0);
25998 SDValue RHS = N->getOperand(1);
25999 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
26000 SDLoc DL(N);
26001 EVT VT = N->getValueType(0);
26002
26003 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
26004 return V;
26005
26006 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
26007 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
26008 LHS->getOpcode() == AArch64ISD::CSEL &&
26009 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
26010 LHS->hasOneUse()) {
26011 // Invert CSEL's condition.
26012 auto OldCond =
26013 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
26014 auto NewCond = getInvertedCondCode(OldCond);
26015
26016 // csel 0, 1, !cond, X
26017 SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(),
26018 LHS.getOperand(0), LHS.getOperand(1),
26019 getCondCode(DAG, NewCond), LHS.getOperand(3));
26020 return DAG.getZExtOrTrunc(CSEL, DL, VT);
26021 }
26022
26023 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
26024 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
26025 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
26026 LHS->hasOneUse()) {
26027 EVT TstVT = LHS->getValueType(0);
26028 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 &&
26029 LHS->getConstantOperandVal(1) < TstVT.getFixedSizeInBits()) {
26030 // This pattern will get better optimization in emitComparison.
26031 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
26032 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
26033 DAG.getSignedConstant(TstImm, DL, TstVT));
26034 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
26035 }
26036 }
26037
26038 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
26039 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
26040 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
26041 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
26042 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
26043 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
26044 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
26045 LHS->getOpcode() == ISD::BITCAST) {
26046 EVT ToVT = LHS->getValueType(0);
26047 EVT FromVT = LHS->getOperand(0).getValueType();
26048 if (FromVT.isFixedLengthVector() &&
26049 FromVT.getVectorElementType() == MVT::i1) {
26050 bool IsNull = isNullConstant(RHS);
26051 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
26052 DL, MVT::i1, LHS->getOperand(0));
26053 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
26054 LHS);
26055 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
26056 }
26057 }
26058
26059 // Try to perform the memcmp when the result is tested for [in]equality with 0
26060 if (SDValue V = performOrXorChainCombine(N, DAG))
26061 return V;
26062
26063 EVT CmpVT = LHS.getValueType();
26064
26065 // NOTE: This exists as a combine only because it proved too awkward to match
26066 // splat(1) across all the NEON types during isel.
26067 APInt SplatLHSVal;
26068 if (CmpVT.isInteger() && Cond == ISD::SETGT &&
26069 ISD::isConstantSplatVector(LHS.getNode(), SplatLHSVal) &&
26070 SplatLHSVal.isOne())
26071 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, CmpVT), RHS, ISD::SETGE);
26072
26073 return SDValue();
26074}
26075
26076// Replace a flag-setting operator (eg ANDS) with the generic version
26077// (eg AND) if the flag is unused.
26078static SDValue performFlagSettingCombine(SDNode *N,
26079 TargetLowering::DAGCombinerInfo &DCI,
26080 unsigned GenericOpcode) {
26081 SDLoc DL(N);
26082 SDValue LHS = N->getOperand(0);
26083 SDValue RHS = N->getOperand(1);
26084 EVT VT = N->getValueType(0);
26085
26086 // If the flag result isn't used, convert back to a generic opcode.
26087 if (!N->hasAnyUseOfValue(1)) {
26088 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
26089 return DCI.CombineTo(N, Res, SDValue(N, 1));
26090 }
26091
26092 // Combine identical generic nodes into this node, re-using the result.
26093 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
26094 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
26095 DCI.CombineTo(Generic, SDValue(N, 0));
26096
26097 return SDValue();
26098}
26099
26100static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
26101 // setcc_merge_zero pred
26102 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
26103 // => extract_subvector (inner setcc_merge_zero)
26104 SDValue Pred = N->getOperand(0);
26105 SDValue LHS = N->getOperand(1);
26106 SDValue RHS = N->getOperand(2);
26107 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26108
26109 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
26110 LHS->getOpcode() != ISD::SIGN_EXTEND)
26111 return SDValue();
26112
26113 SDValue Extract = LHS->getOperand(0);
26114 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26115 Extract->getValueType(0) != N->getValueType(0) ||
26116 Extract->getConstantOperandVal(1) != 0)
26117 return SDValue();
26118
26119 SDValue InnerSetCC = Extract->getOperand(0);
26120 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
26121 return SDValue();
26122
26123 // By this point we've effectively got
26124 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
26125 // lanes are already zero then the trunc(sext()) sequence is redundant and we
26126 // can operate on A directly.
26127 SDValue InnerPred = InnerSetCC.getOperand(0);
26128 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
26129 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
26130 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
26131 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
26132 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
26133 return Extract;
26134
26135 return SDValue();
26136}
26137
26138static bool isSignExtInReg(const SDValue &V) {
26139 if (V.getOpcode() != AArch64ISD::VASHR ||
26140 V.getOperand(0).getOpcode() != AArch64ISD::VSHL)
26141 return false;
26142
26143 unsigned BitWidth = V->getValueType(0).getScalarSizeInBits();
26144 unsigned ShiftAmtR = V.getConstantOperandVal(1);
26145 unsigned ShiftAmtL = V.getOperand(0).getConstantOperandVal(1);
26146 return (ShiftAmtR == ShiftAmtL && ShiftAmtR == (BitWidth - 1));
26147}
26148
26149static SDValue
26150performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26151 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26152 "Unexpected opcode!");
26153
26154 SelectionDAG &DAG = DCI.DAG;
26155 SDValue Pred = N->getOperand(0);
26156 SDValue LHS = N->getOperand(1);
26157 SDValue RHS = N->getOperand(2);
26158 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26159
26160 if (SDValue V = performSetCCPunpkCombine(N, DAG))
26161 return V;
26162
26163 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26164 LHS->getOpcode() == ISD::SIGN_EXTEND &&
26165 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
26166 // setcc_merge_zero(
26167 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
26168 // => setcc_merge_zero(pred, ...)
26169 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26170 LHS->getOperand(0)->getOperand(0) == Pred)
26171 return LHS->getOperand(0);
26172
26173 // setcc_merge_zero(
26174 // all_active, extend(nxvNi1 ...), != splat(0))
26175 // -> nxvNi1 ...
26176 if (isAllActivePredicate(DAG, Pred))
26177 return LHS->getOperand(0);
26178
26179 // setcc_merge_zero(
26180 // pred, extend(nxvNi1 ...), != splat(0))
26181 // -> nxvNi1 and(pred, ...)
26182 if (DCI.isAfterLegalizeDAG())
26183 // Do this after legalization to allow more folds on setcc_merge_zero
26184 // to be recognized.
26185 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
26186 LHS->getOperand(0), Pred);
26187 }
26188
26189 // setcc_merge_zero(
26190 // pred, insert_subvector(undef, signext_inreg(vNi1), 0), != splat(0))
26191 // => setcc_merge_zero(
26192 // pred, insert_subvector(undef, shl(vNi1), 0), != splat(0))
26193 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26194 LHS->getOpcode() == ISD::INSERT_SUBVECTOR && LHS.hasOneUse()) {
26195 SDValue L0 = LHS->getOperand(0);
26196 SDValue L1 = LHS->getOperand(1);
26197 SDValue L2 = LHS->getOperand(2);
26198
26199 if (L0.getOpcode() == ISD::UNDEF && isNullConstant(L2) &&
26200 isSignExtInReg(L1)) {
26201 SDLoc DL(N);
26202 SDValue Shl = L1.getOperand(0);
26203 SDValue NewLHS = DAG.getNode(ISD::INSERT_SUBVECTOR, DL,
26204 LHS.getValueType(), L0, Shl, L2);
26205 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, N->getValueType(0),
26206 Pred, NewLHS, RHS, N->getOperand(3));
26207 }
26208 }
26209
26210 return SDValue();
26211}
26212
26213// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
26214// as well as whether the test should be inverted. This code is required to
26215// catch these cases (as opposed to standard dag combines) because
26216// AArch64ISD::TBZ is matched during legalization.
26217static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
26218 SelectionDAG &DAG) {
26219
26220 if (!Op->hasOneUse())
26221 return Op;
26222
26223 // We don't handle undef/constant-fold cases below, as they should have
26224 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
26225 // etc.)
26226
26227 // (tbz (trunc x), b) -> (tbz x, b)
26228 // This case is just here to enable more of the below cases to be caught.
26229 if (Op->getOpcode() == ISD::TRUNCATE &&
26230 Bit < Op->getValueType(0).getSizeInBits()) {
26231 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26232 }
26233
26234 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
26235 if (Op->getOpcode() == ISD::ANY_EXTEND &&
26236 Bit < Op->getOperand(0).getValueSizeInBits()) {
26237 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26238 }
26239
26240 if (Op->getNumOperands() != 2)
26241 return Op;
26242
26243 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
26244 if (!C)
26245 return Op;
26246
26247 switch (Op->getOpcode()) {
26248 default:
26249 return Op;
26250
26251 // (tbz (and x, m), b) -> (tbz x, b)
26252 case ISD::AND:
26253 if ((C->getZExtValue() >> Bit) & 1)
26254 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26255 return Op;
26256
26257 // (tbz (shl x, c), b) -> (tbz x, b-c)
26258 case ISD::SHL:
26259 if (C->getZExtValue() <= Bit &&
26260 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26261 Bit = Bit - C->getZExtValue();
26262 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26263 }
26264 return Op;
26265
26266 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
26267 case ISD::SRA:
26268 Bit = Bit + C->getZExtValue();
26269 if (Bit >= Op->getValueType(0).getSizeInBits())
26270 Bit = Op->getValueType(0).getSizeInBits() - 1;
26271 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26272
26273 // (tbz (srl x, c), b) -> (tbz x, b+c)
26274 case ISD::SRL:
26275 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26276 Bit = Bit + C->getZExtValue();
26277 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26278 }
26279 return Op;
26280
26281 // (tbz (xor x, -1), b) -> (tbnz x, b)
26282 case ISD::XOR:
26283 if ((C->getZExtValue() >> Bit) & 1)
26284 Invert = !Invert;
26285 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26286 }
26287}
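// Editor's note (illustrative walk-through, not part of the upstream source):
//   tbz (xor (srl x, #2), -1), #3
//     XOR with a mask whose bit 3 is set  -> invert the branch (use tbnz)
//     SRL by #2                           -> test bit 3 + 2 = 5 of x
// so the branch is finally emitted as: tbnz x, #5.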
26288
26289 // Optimize test single bit zero/non-zero and branch.
26290static SDValue performTBZCombine(SDNode *N,
26291 TargetLowering::DAGCombinerInfo &DCI,
26292 SelectionDAG &DAG) {
26293 unsigned Bit = N->getConstantOperandVal(2);
26294 bool Invert = false;
26295 SDValue TestSrc = N->getOperand(1);
26296 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
26297
26298 if (TestSrc == NewTestSrc)
26299 return SDValue();
26300
26301 unsigned NewOpc = N->getOpcode();
26302 if (Invert) {
26303 if (NewOpc == AArch64ISD::TBZ)
26304 NewOpc = AArch64ISD::TBNZ;
26305 else {
26306 assert(NewOpc == AArch64ISD::TBNZ);
26307 NewOpc = AArch64ISD::TBZ;
26308 }
26309 }
26310
26311 SDLoc DL(N);
26312 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
26313 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
26314}
26315
26316// Swap vselect operands where it may allow a predicated operation to achieve
26317// the `sel`.
26318//
26319// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
26320 // => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
26321static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
26322 auto SelectA = N->getOperand(1);
26323 auto SelectB = N->getOperand(2);
26324 auto NTy = N->getValueType(0);
26325
26326 if (!NTy.isScalableVector())
26327 return SDValue();
26328 SDValue SetCC = N->getOperand(0);
26329 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
26330 return SDValue();
26331
26332 switch (SelectB.getOpcode()) {
26333 default:
26334 return SDValue();
26335 case ISD::FMUL:
26336 case ISD::FSUB:
26337 case ISD::FADD:
26338 break;
26339 }
26340 if (SelectA != SelectB.getOperand(0))
26341 return SDValue();
26342
26343 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
26344 ISD::CondCode InverseCC =
26345 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
26346 auto InverseSetCC =
26347 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
26348 SetCC.getOperand(1), InverseCC);
26349
26350 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
26351 {InverseSetCC, SelectB, SelectA});
26352}
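// Editor's note (illustrative, not part of the upstream source):
//   (vselect (setcc oge p, q), a, (fadd a, b))
// is rewritten to
//   (vselect (setcc ult p, q), (fadd a, b), a)
// so the arithmetic sits in the "true" operand and `a` becomes the value for
// the inactive lanes, the shape the merge-passthru folds in
// performVSelectCombine below can take advantage of.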
26353
26354// vselect (v1i1 setcc) ->
26355// vselect (v1iXX setcc) (XX is the size of the compared operand type)
26356// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
26357// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
26358 // such VSELECT.
26359static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
26360 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
26361 return SwapResult;
26362
26363 SDValue N0 = N->getOperand(0);
26364 SDValue IfTrue = N->getOperand(1);
26365 SDValue IfFalse = N->getOperand(2);
26366 EVT ResVT = N->getValueType(0);
26367 EVT CCVT = N0.getValueType();
26368
26369 if (isAllActivePredicate(DAG, N0))
26370 return N->getOperand(1);
26371
26372 if (isAllInactivePredicate(N0))
26373 return N->getOperand(2);
26374
26375 if (isMergePassthruOpcode(IfTrue.getOpcode()) && IfTrue.hasOneUse()) {
26376 // vselect A, (merge_passthru_op all_active, B,{Bn,} -), C
26377 // vselect A, (merge_passthru_op -, B,{Bn,} undef), C
26378 // vselect A, (merge_passthru_op A, B,{Bn,} -), C
26379 // -> merge_passthru_op A, B,{Bn,} C
26380 if (isAllActivePredicate(DAG, IfTrue->getOperand(0)) ||
26381 IfTrue->getOperand(IfTrue.getNumOperands() - 1).isUndef() ||
26382 IfTrue->getOperand(0) == N0) {
26383 SmallVector<SDValue> Ops(IfTrue->ops());
26384 Ops[0] = N0;
26385 Ops[IfTrue.getNumOperands() - 1] = IfFalse;
26386
26387 return DAG.getNode(IfTrue.getOpcode(), SDLoc(N), ResVT, Ops);
26388 }
26389 }
26390
26391 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
26392 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
26393 // supported types.
26394 SDValue SetCC = N->getOperand(0);
26395 if (SetCC.getOpcode() == ISD::SETCC &&
26396 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
26397 SDValue CmpLHS = SetCC.getOperand(0);
26398 EVT VT = CmpLHS.getValueType();
26399 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
26400 SDNode *SplatLHS = N->getOperand(1).getNode();
26401 SDNode *SplatRHS = N->getOperand(2).getNode();
26402 APInt SplatLHSVal;
26403 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
26404 VT.isSimple() &&
26405 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
26406 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
26407 VT.getSimpleVT().SimpleTy) &&
26408 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
26409 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
26410 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
26411 unsigned NumElts = VT.getVectorNumElements();
26412 SmallVector<SDValue, 8> Ops(
26413 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
26414 VT.getScalarType()));
26415 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
26416
26417 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
26418 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
26419 return Or;
26420 }
26421 }
26422
26423 EVT CmpVT = N0.getOperand(0).getValueType();
26424 if (N0.getOpcode() != ISD::SETCC ||
26425 CCVT.getVectorElementCount() != ResVT.getVectorElementCount() ||
26426 CCVT.getVectorElementType() != MVT::i1 ||
26427 ResVT.getVectorElementType() == MVT::f16)
26428 return SDValue();
26429
26430 // Only combine when the result type is of the same size as the compared
26431 // operands.
26432 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
26433 return SDValue();
26434
26435 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
26436 N0.getOperand(0), N0.getOperand(1),
26437 cast<CondCodeSDNode>(N0.getOperand(2))->get());
26438 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
26439 IfTrue, IfFalse);
26440}
26441
26442/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
26443/// the compare-mask instructions rather than going via NZCV, even if LHS and
26444/// RHS are really scalar. This replaces any scalar setcc in the above pattern
26445/// with a vector one followed by a DUP shuffle on the result.
26446static SDValue performSelectCombine(SDNode *N,
26447 TargetLowering::DAGCombinerInfo &DCI) {
26448 SelectionDAG &DAG = DCI.DAG;
26449 SDValue N0 = N->getOperand(0);
26450 EVT ResVT = N->getValueType(0);
26451
26452 if (N0.getOpcode() != ISD::SETCC)
26453 return SDValue();
26454
26455 if (ResVT.isScalableVT())
26456 return SDValue();
26457
26458 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
26459 // scalar SetCCResultType. We also don't expect vectors, because we assume
26460 // that selects fed by vector SETCCs are canonicalized to VSELECT.
26461 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
26462 "Scalar-SETCC feeding SELECT has unexpected result type!");
26463
26464 // If NumMaskElts == 0, the comparison is larger than select result. The
26465 // largest real NEON comparison is 64-bits per lane, which means the result is
26466 // at most 32-bits and an illegal vector. Just bail out for now.
26467 EVT SrcVT = N0.getOperand(0).getValueType();
26468
26469 // Don't try to do this optimization when the setcc itself has i1 operands.
26470 // There are no legal vectors of i1, so this would be pointless. v1f16 is
26471 // ruled out to prevent the creation of setcc that need to be scalarized.
26472 if (SrcVT == MVT::i1 ||
26473 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
26474 return SDValue();
26475
26476 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
26477 if (!ResVT.isVector() || NumMaskElts == 0)
26478 return SDValue();
26479
26480 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
26481 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
26482
26483 // Also bail out if the vector CCVT isn't the same size as ResVT.
26484 // This can happen if the SETCC operand size doesn't divide the ResVT size
26485 // (e.g., f64 vs v3f32).
26486 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
26487 return SDValue();
26488
26489 // Make sure we didn't create illegal types, if we're not supposed to.
26490 assert(DCI.isBeforeLegalize() ||
26491 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
26492
26493 // First perform a vector comparison, where lane 0 is the one we're interested
26494 // in.
26495 SDLoc DL(N0);
26496 SDValue LHS =
26497 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
26498 SDValue RHS =
26499 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
26500 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
26501
26502 // Now duplicate the comparison mask we want across all other lanes.
26503 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
26504 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
26505 Mask = DAG.getNode(ISD::BITCAST, DL,
26506 ResVT.changeVectorElementTypeToInteger(), Mask);
26507
26508 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
26509}
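// Editor's note (illustrative, not part of the upstream source): for
// "select (setcc olt f64 a, b), v2f64 x, v2f64 y" the scalar compare is redone
// as a v2f64 vector setcc on scalar_to_vector(a) and scalar_to_vector(b);
// lane 0 of the resulting v2i64 mask is then splatted with a DUP-style shuffle
// and fed to a VSELECT, avoiding a round trip through NZCV.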
26510
26511static SDValue performDUPCombine(SDNode *N,
26512 TargetLowering::DAGCombinerInfo &DCI) {
26513 EVT VT = N->getValueType(0);
26514 SDLoc DL(N);
26515 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
26516 // 128-bit vector version.
26517 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
26518 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
26519 SmallVector<SDValue> Ops(N->ops());
26520 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
26521 DCI.DAG.getVTList(LVT), Ops)) {
26522 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
26523 DCI.DAG.getConstant(0, DL, MVT::i64));
26524 }
26525 }
26526
26527 if (N->getOpcode() == AArch64ISD::DUP) {
26528 // If the instruction is known to produce a scalar in SIMD registers, we can
26529 // duplicate it across the vector lanes using DUPLANE instead of moving it
26530 // to a GPR first. For example, this allows us to handle:
26531 // v4i32 = DUP (i32 (FCMGT (f32, f32)))
26532 SDValue Op = N->getOperand(0);
26533 // FIXME: Ideally, we should be able to handle all instructions that
26534 // produce a scalar value in FPRs.
26535 if (Op.getOpcode() == AArch64ISD::FCMEQ ||
26536 Op.getOpcode() == AArch64ISD::FCMGE ||
26537 Op.getOpcode() == AArch64ISD::FCMGT) {
26538 EVT ElemVT = VT.getVectorElementType();
26539 EVT ExpandedVT = VT;
26540 // Insert into a 128-bit vector to match DUPLANE's pattern.
26541 if (VT.getSizeInBits() != 128)
26542 ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
26543 128 / ElemVT.getSizeInBits());
26544 SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
26545 SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
26546 DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
26547 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
26548 }
26549
26550 if (DCI.isAfterLegalizeDAG()) {
26551 // If scalar dup's operand is extract_vector_elt, try to combine them into
26552 // duplane. For example,
26553 //
26554 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
26555 // t18: v4i32 = AArch64ISD::DUP t21
26556 // ==>
26557 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
26558 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
26559 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
26560 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
26561 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
26562 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
26563 EXTRACT_VEC_ELT.getOperand(1));
26564 }
26565 }
26566 }
26567
26568 return performPostLD1Combine(N, DCI, false);
26569 }
26570
26571 return SDValue();
26572}
26573
26574/// Get rid of unnecessary NVCASTs (that don't change the type).
26575static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
26576 if (N->getValueType(0) == N->getOperand(0).getValueType())
26577 return N->getOperand(0);
26578 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
26579 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
26580 N->getOperand(0).getOperand(0));
26581
26582 return SDValue();
26583}
26584
26585// If all users of the globaladdr are of the form (globaladdr + constant), find
26586// the smallest constant, fold it into the globaladdr's offset and rewrite the
26587// globaladdr as (globaladdr + constant) - constant.
26588static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
26589 const AArch64Subtarget *Subtarget,
26590 const TargetMachine &TM) {
26591 auto *GN = cast<GlobalAddressSDNode>(N);
26592 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
26593 AArch64II::MO_NO_FLAG)
26594 return SDValue();
26595
26596 uint64_t MinOffset = -1ull;
26597 for (SDNode *N : GN->users()) {
26598 if (N->getOpcode() != ISD::ADD)
26599 return SDValue();
26600 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
26601 if (!C)
26602 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
26603 if (!C)
26604 return SDValue();
26605 MinOffset = std::min(MinOffset, C->getZExtValue());
26606 }
26607 uint64_t Offset = MinOffset + GN->getOffset();
26608
26609 // Require that the new offset is larger than the existing one. Otherwise, we
26610 // can end up oscillating between two possible DAGs, for example,
26611 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
26612 if (Offset <= uint64_t(GN->getOffset()))
26613 return SDValue();
26614
26615 // Check whether folding this offset is legal. It must not go out of bounds of
26616 // the referenced object to avoid violating the code model, and must be
26617 // smaller than 2^20 because this is the largest offset expressible in all
26618 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
26619 // stores an immediate signed 21 bit offset.)
26620 //
26621 // This check also prevents us from folding negative offsets, which will end
26622 // up being treated in the same way as large positive ones. They could also
26623 // cause code model violations, and aren't really common enough to matter.
26624 if (Offset >= (1 << 20))
26625 return SDValue();
26626
26627 const GlobalValue *GV = GN->getGlobal();
26628 Type *T = GV->getValueType();
26629 if (!T->isSized() ||
26630 Offset > GV->getDataLayout().getTypeAllocSize(T))
26631 return SDValue();
26632
26633 SDLoc DL(GN);
26634 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
26635 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
26636 DAG.getConstant(MinOffset, DL, MVT::i64));
26637}
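// Editor's note (illustrative, not part of the upstream source): if every user
// of G is (add G, C) with C in {4, 16}, then MinOffset = 4 and G is rewritten
// as (sub (G + 4), 4); the +4 moves into the relocated symbol offset while the
// users' adds fold against the -4.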
26638
26639static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
26640 const AArch64Subtarget *Subtarget) {
26641 SDValue BR = N->getOperand(0);
26642 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
26644 return SDValue();
26645
26646 SDLoc DL(N);
26647 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
26648}
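// Editor's note (illustrative, not part of the upstream source): the fold above
// relies on ctlz(bitreverse(x)) == cttz(x); with FEAT_CSSC the resulting CTTZ
// node selects to a single scalar ctz instruction instead of an rbit + clz pair.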
26649
26650// Turns the vector of indices into a vector of byte offsets by scaling Offset
26651// by (BitWidth / 8).
26652static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
26653 SDLoc DL, unsigned BitWidth) {
26654 assert(Offset.getValueType().isScalableVector() &&
26655 "This method is only for scalable vectors of offsets");
26656
26657 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
26658 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
26659
26660 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
26661}
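// Editor's note (illustrative, not part of the upstream source): for 32-bit
// elements the index vector is shifted left by log2(32 / 8) = 2, i.e. an index
// splat of 3 becomes a byte-offset splat of 12.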
26662
26663/// Check if the value of \p OffsetInBytes can be used as an immediate for
26664/// the gather load/prefetch and scatter store instructions with vector base and
26665/// immediate offset addressing mode:
26666///
26667/// [<Zn>.[S|D]{, #<imm>}]
26668///
26669/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
26670inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
26671 unsigned ScalarSizeInBytes) {
26672 // The immediate is not a multiple of the scalar size.
26673 if (OffsetInBytes % ScalarSizeInBytes)
26674 return false;
26675
26676 // The immediate is out of range.
26677 if (OffsetInBytes / ScalarSizeInBytes > 31)
26678 return false;
26679
26680 return true;
26681}
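// Editor's note (illustrative, not part of the upstream source): with 4-byte
// scalars the valid immediates are 0, 4, 8, ..., 124 (= 31 * 4); 126 fails the
// multiple-of-size check and 128 exceeds the 31-element range.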
26682
26683/// Check if the value of \p Offset represents a valid immediate for the SVE
26684/// gather load/prefetch and scatter store instructions with vector base and
26685/// immediate offset addressing mode:
26686///
26687/// [<Zn>.[S|D]{, #<imm>}]
26688///
26689/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
26690static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
26691 unsigned ScalarSizeInBytes) {
26692 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
26693 return OffsetConst && isValidImmForSVEVecImmAddrMode(
26694 OffsetConst->getZExtValue(), ScalarSizeInBytes);
26695}
26696
26697static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
26698 unsigned Opcode,
26699 bool OnlyPackedOffsets = true) {
26700 const SDValue Src = N->getOperand(2);
26701 const EVT SrcVT = Src->getValueType(0);
26702 assert(SrcVT.isScalableVector() &&
26703 "Scatter stores are only possible for SVE vectors");
26704
26705 SDLoc DL(N);
26706 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
26707
26708 // Make sure that source data will fit into an SVE register
26709 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26710 return SDValue();
26711
26712 // For FPs, ACLE only supports _packed_ single and double precision types.
26713 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
26714 if (SrcElVT.isFloatingPoint())
26715 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
26716 ((Opcode != AArch64ISD::SST1Q_PRED &&
26717 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
26718 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
26719 return SDValue();
26720
26721 // Depending on the addressing mode, this is either a pointer or a vector of
26722 // pointers (that fits into one register)
26723 SDValue Base = N->getOperand(4);
26724 // Depending on the addressing mode, this is either a single offset or a
26725 // vector of offsets (that fits into one register)
26726 SDValue Offset = N->getOperand(5);
26727
26728 // For "scalar + vector of indices", just scale the indices. This only
26729 // applies to non-temporal scatters because there's no instruction that takes
26730 // indices.
26731 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
26732 Offset =
26733 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcVT.getScalarSizeInBits());
26734 Opcode = AArch64ISD::SSTNT1_PRED;
26735 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
26736 Offset =
26737 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcVT.getScalarSizeInBits());
26738 Opcode = AArch64ISD::SST1Q_PRED;
26739 }
26740
26741 // In the case of non-temporal scatter stores there's only one SVE instruction
26742 // per data-size: "vector + scalar", i.e.
26743 // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
26744 // Since we do have intrinsics that allow the arguments to be in a different
26745 // order, we may need to swap them to match the spec.
26746 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
26747 Offset.getValueType().isVector())
26748 std::swap(Base, Offset);
26749
26750 // SST1_IMM requires that the offset is an immediate that is:
26751 // * a multiple of #SizeInBytes,
26752 // * in the range [0, 31 x #SizeInBytes],
26753 // where #SizeInBytes is the size in bytes of the stored items. For
26754 // immediates outside that range and non-immediate scalar offsets use SST1 or
26755 // SST1_UXTW instead.
26756 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
26757 if (!isValidImmForSVEVecImmAddrMode(Offset,
26758 SrcVT.getScalarSizeInBits() / 8)) {
26759 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26760 Opcode = AArch64ISD::SST1_UXTW_PRED;
26761 else
26762 Opcode = AArch64ISD::SST1_PRED;
26763
26764 std::swap(Base, Offset);
26765 }
26766 }
26767
26768 auto &TLI = DAG.getTargetLoweringInfo();
26769 if (!TLI.isTypeLegal(Base.getValueType()))
26770 return SDValue();
26771
26772 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
26773 // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
26774 // nxv2i64. Legalize accordingly.
26775 if (!OnlyPackedOffsets &&
26776 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26777 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26778
26779 if (!TLI.isTypeLegal(Offset.getValueType()))
26780 return SDValue();
26781
26782 // Source value type that is representable in hardware
26783 EVT HwSrcVt = getSVEContainerType(SrcVT);
26784
26785 // Keep the original type of the input data to store - this is needed to be
26786 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
26787 // FP values we want the integer equivalent, so just use HwSrcVt.
26788 SDValue InputVT = DAG.getValueType(SrcVT);
26789 if (SrcVT.isFloatingPoint())
26790 InputVT = DAG.getValueType(HwSrcVt);
26791
26792 SDVTList VTs = DAG.getVTList(MVT::Other);
26793 SDValue SrcNew;
26794
26795 if (Src.getValueType().isFloatingPoint())
26796 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
26797 else
26798 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
26799
26800 SDValue Ops[] = {N->getOperand(0), // Chain
26801 SrcNew,
26802 N->getOperand(3), // Pg
26803 Base,
26804 Offset,
26805 InputVT};
26806
26807 return DAG.getNode(Opcode, DL, VTs, Ops);
26808}
26809
26810static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
26811 unsigned Opcode,
26812 bool OnlyPackedOffsets = true) {
26813 const EVT RetVT = N->getValueType(0);
26814 assert(RetVT.isScalableVector() &&
26815 "Gather loads are only possible for SVE vectors");
26816
26817 SDLoc DL(N);
26818
26819 // Make sure that the loaded data will fit into an SVE register
26820 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
26821 return SDValue();
26822
26823 // Depending on the addressing mode, this is either a pointer or a vector of
26824 // pointers (that fits into one register)
26825 SDValue Base = N->getOperand(3);
26826 // Depending on the addressing mode, this is either a single offset or a
26827 // vector of offsets (that fits into one register)
26828 SDValue Offset = N->getOperand(4);
26829
26830 // For "scalar + vector of indices", scale the indices to obtain unscaled
26831 // offsets. This applies to non-temporal and quadword gathers, which do not
26832 // have an addressing mode with scaled offset.
26833 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
26834 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
26835 RetVT.getScalarSizeInBits());
26836 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
26837 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
26838 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
26839 RetVT.getScalarSizeInBits());
26840 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
26841 }
26842
26843 // In the case of non-temporal gather loads and quadword gather loads there's
26844 // only one addressing mode: "vector + scalar", e.g.
26845 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
26846 // Since we do have intrinsics that allow the arguments to be in a different
26847 // order, we may need to swap them to match the spec.
26848 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
26849 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
26850 Offset.getValueType().isVector())
26851 std::swap(Base, Offset);
26852
26853 // GLD{FF}1_IMM requires that the offset is an immediate that is:
26854 // * a multiple of #SizeInBytes,
26855 // * in the range [0, 31 x #SizeInBytes],
26856 // where #SizeInBytes is the size in bytes of the loaded items. For
26857 // immediates outside that range and non-immediate scalar offsets use
26858 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
26859 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
26860 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
26861 if (!isValidImmForSVEVecImmAddrMode(Offset,
26862 RetVT.getScalarSizeInBits() / 8)) {
26863 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26864 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26865 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
26866 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
26867 else
26868 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26869 ? AArch64ISD::GLD1_MERGE_ZERO
26870 : AArch64ISD::GLDFF1_MERGE_ZERO;
26871
26872 std::swap(Base, Offset);
26873 }
26874 }
26875
26876 auto &TLI = DAG.getTargetLoweringInfo();
26877 if (!TLI.isTypeLegal(Base.getValueType()))
26878 return SDValue();
26879
26880 // Some gather load variants allow unpacked offsets, but only as nxv2i32
26881 // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
26882 // nxv2i64. Legalize accordingly.
26883 if (!OnlyPackedOffsets &&
26884 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26885 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
26886
26887 // Return value type that is representable in hardware
26888 EVT HwRetVt = getSVEContainerType(RetVT);
26889
26890 // Keep the original output value type around - this is needed to be able to
26891 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
26892 // values we want the integer equivalent, so just use HwRetVT.
26893 SDValue OutVT = DAG.getValueType(RetVT);
26894 if (RetVT.isFloatingPoint())
26895 OutVT = DAG.getValueType(HwRetVt);
26896
26897 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
26898 SDValue Ops[] = {N->getOperand(0), // Chain
26899 N->getOperand(2), // Pg
26900 Base, Offset, OutVT};
26901
26902 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
26903 SDValue LoadChain = SDValue(Load.getNode(), 1);
26904
26905 if (RetVT.isInteger() && (RetVT != HwRetVt))
26906 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
26907
26908 // If the original return value was FP, bitcast accordingly. Doing it here
26909 // means that we can avoid adding TableGen patterns for FPs.
26910 if (RetVT.isFloatingPoint())
26911 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
26912
26913 return DAG.getMergeValues({Load, LoadChain}, DL);
26914}
26915
26916 static SDValue
26917performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26918 SelectionDAG &DAG) {
26919 SDLoc DL(N);
26920 SDValue Src = N->getOperand(0);
26921 unsigned Opc = Src->getOpcode();
26922
26923 // Sign extend of an unsigned unpack -> signed unpack
26924 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
26925
26926 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
26927 : AArch64ISD::SUNPKLO;
26928
26929 // Push the sign extend to the operand of the unpack
26930 // This is necessary where, for example, the operand of the unpack
26931 // is another unpack:
26932 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
26933 // ->
26934 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
26935 // ->
26936 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
26937 SDValue ExtOp = Src->getOperand(0);
26938 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
26939 EVT EltTy = VT.getVectorElementType();
26940 (void)EltTy;
26941
26942 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
26943 "Sign extending from an invalid type");
26944
26945 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
26946
26947 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
26948 ExtOp, DAG.getValueType(ExtVT));
26949
26950 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
26951 }
26952
26953 // Sign extend of CSET -> CSETM.
26954 if (Opc == AArch64ISD::CSEL &&
26955 cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) {
26956 EVT VT = N->getValueType(0);
26957 SDValue TVal = Src.getOperand(0);
26958 SDValue FVal = Src.getOperand(1);
26959
26960 // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV
26961 if (isNullConstant(TVal) && isOneConstant(FVal))
26962 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal,
26963 DAG.getAllOnesConstant(DL, VT), Src.getOperand(2),
26964 Src.getOperand(3));
26965
26966 // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV
26967 if (isOneConstant(TVal) && isNullConstant(FVal))
26968 return DAG.getNode(AArch64ISD::CSEL, DL, VT,
26969 DAG.getAllOnesConstant(DL, VT), FVal,
26970 Src.getOperand(2), Src.getOperand(3));
26971 }
26972
26973 if (DCI.isBeforeLegalizeOps())
26974 return SDValue();
26975
26976 if (!EnableCombineMGatherIntrinsics)
26977 return SDValue();
26978
26979 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
26980 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
26981 unsigned NewOpc;
26982 unsigned MemVTOpNum = 4;
26983 switch (Opc) {
26984 case AArch64ISD::LD1_MERGE_ZERO:
26985 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
26986 MemVTOpNum = 3;
26987 break;
26988 case AArch64ISD::LDNF1_MERGE_ZERO:
26989 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
26990 MemVTOpNum = 3;
26991 break;
26992 case AArch64ISD::LDFF1_MERGE_ZERO:
26993 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
26994 MemVTOpNum = 3;
26995 break;
26996 case AArch64ISD::GLD1_MERGE_ZERO:
26997 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
26998 break;
26999 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
27000 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
27001 break;
27002 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
27003 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
27004 break;
27005 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
27006 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
27007 break;
27008 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
27009 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
27010 break;
27011 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
27012 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
27013 break;
27014 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
27015 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
27016 break;
27017 case AArch64ISD::GLDFF1_MERGE_ZERO:
27018 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
27019 break;
27020 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
27021 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
27022 break;
27023 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
27024 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
27025 break;
27026 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
27027 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
27028 break;
27029 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
27030 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
27031 break;
27032 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
27033 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
27034 break;
27035 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
27036 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
27037 break;
27038 case AArch64ISD::GLDNT1_MERGE_ZERO:
27039 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
27040 break;
27041 default:
27042 return SDValue();
27043 }
27044
27045 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
27046 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
27047
27048 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
27049 return SDValue();
27050
27051 EVT DstVT = N->getValueType(0);
27052 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
27053
27054 SmallVector<SDValue, 5> Ops;
27055 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
27056 Ops.push_back(Src->getOperand(I));
27057
27058 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
27059 DCI.CombineTo(N, ExtLoad);
27060 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
27061
27062 // Return N so it doesn't get rechecked
27063 return SDValue(N, 0);
27064}
27065
27066/// Legalize the gather prefetch (scalar + vector addressing mode) when the
27067/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
27068/// != nxv2i32) do not need legalization.
27069static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
27070 const unsigned OffsetPos = 4;
27071 SDValue Offset = N->getOperand(OffsetPos);
27072
27073 // Not an unpacked vector, bail out.
27074 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
27075 return SDValue();
27076
27077 // Extend the unpacked offset vector to 64-bit lanes.
27078 SDLoc DL(N);
27079 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
27080 SmallVector<SDValue, 5> Ops(N->ops());
27081 // Replace the offset operand with the 64-bit one.
27082 Ops[OffsetPos] = Offset;
27083
27084 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27085}
27086
27087/// Combines a node carrying the intrinsic
27088/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
27089/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
27090/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
27091/// sve gather prefetch instruction with vector plus immediate addressing mode.
27092static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
27093 unsigned ScalarSizeInBytes) {
27094 const unsigned ImmPos = 4, OffsetPos = 3;
27095 // No need to combine the node if the immediate is valid...
27096 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
27097 return SDValue();
27098
27099 // ...otherwise swap the offset base with the offset...
27100 SmallVector<SDValue, 5> Ops(N->ops());
27101 std::swap(Ops[ImmPos], Ops[OffsetPos]);
27102 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
27103 // `aarch64_sve_prfb_gather_uxtw_index`.
27104 SDLoc DL(N);
27105 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
27106 MVT::i64);
27107
27108 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27109}
27110
27111// Return true if the vector operation can guarantee only the first lane of its
27112// result contains data, with all bits in other lanes set to zero.
27113static bool isLanes1toNKnownZero(SDValue Op) {
27114 switch (Op.getOpcode()) {
27115 default:
27116 return false;
27117 case AArch64ISD::ANDV_PRED:
27118 case AArch64ISD::EORV_PRED:
27119 case AArch64ISD::FADDA_PRED:
27120 case AArch64ISD::FADDV_PRED:
27121 case AArch64ISD::FMAXNMV_PRED:
27122 case AArch64ISD::FMAXV_PRED:
27123 case AArch64ISD::FMINNMV_PRED:
27124 case AArch64ISD::FMINV_PRED:
27125 case AArch64ISD::ORV_PRED:
27126 case AArch64ISD::SADDV_PRED:
27127 case AArch64ISD::SMAXV_PRED:
27128 case AArch64ISD::SMINV_PRED:
27129 case AArch64ISD::UADDV_PRED:
27130 case AArch64ISD::UMAXV_PRED:
27131 case AArch64ISD::UMINV_PRED:
27132 return true;
27133 }
27134}
27135
27136static SDValue removeRedundantInsertVectorElt(SDNode *N) {
27137 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
27138 SDValue InsertVec = N->getOperand(0);
27139 SDValue InsertElt = N->getOperand(1);
27140 SDValue InsertIdx = N->getOperand(2);
27141
27142 // We only care about inserts into the first element...
27143 if (!isNullConstant(InsertIdx))
27144 return SDValue();
27145 // ...of a zero'd vector...
27146 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
27147 return SDValue();
27148 // ...where the inserted data was previously extracted...
27149 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
27150 return SDValue();
27151
27152 SDValue ExtractVec = InsertElt.getOperand(0);
27153 SDValue ExtractIdx = InsertElt.getOperand(1);
27154
27155 // ...from the first element of a vector.
27156 if (!isNullConstant(ExtractIdx))
27157 return SDValue();
27158
27159 // If we get here we are effectively trying to zero lanes 1-N of a vector.
27160
27161 // Ensure there's no type conversion going on.
27162 if (N->getValueType(0) != ExtractVec.getValueType())
27163 return SDValue();
27164
27165 if (!isLanes1toNKnownZero(ExtractVec))
27166 return SDValue();
27167
27168 // The explicit zeroing is redundant.
27169 return ExtractVec;
27170}
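// Editor's note (illustrative, not part of the upstream source): this removes
// patterns such as
//   insert_vector_elt (zero vector),
//                     (extract_vector_elt (UADDV_PRED pg, z), 0), 0
// where the predicated reduction already guarantees lanes 1..N are zero, so
// re-inserting lane 0 into a zeroed vector adds nothing.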
27171
27172 static SDValue
27173performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
27174 if (SDValue Res = removeRedundantInsertVectorElt(N))
27175 return Res;
27176
27177 return performPostLD1Combine(N, DCI, true);
27178}
27179
27180static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
27181 TargetLowering::DAGCombinerInfo &DCI,
27182 const AArch64Subtarget *Subtarget) {
27183 SDValue N0 = N->getOperand(0);
27184 EVT VT = N->getValueType(0);
27185
27186 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
27187 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
27188 return SDValue();
27189
27190 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
27191 EVT EltVT = VT.getVectorElementType();
27192 return EltVT == MVT::f32 || EltVT == MVT::f64;
27193 };
27194
27195 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
27196 // We purposefully don't care about legality of the nodes here as we know
27197 // they can be split down into something legal.
27198 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
27199 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
27200 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
27201 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
27202 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
27203 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
27204 LN0->getChain(), LN0->getBasePtr(),
27205 N0.getValueType(), LN0->getMemOperand());
27206 DCI.CombineTo(N, ExtLoad);
27207 DCI.CombineTo(
27208 N0.getNode(),
27209 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
27210 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
27211 ExtLoad.getValue(1));
27212 return SDValue(N, 0); // Return N so it doesn't get rechecked!
27213 }
27214
27215 return SDValue();
27216}
27217
27218static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
27219 const AArch64Subtarget *Subtarget) {
27220 EVT VT = N->getValueType(0);
27221
27222 // Don't expand for NEON, SVE2 or SME
27223 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
27224 return SDValue();
27225
27226 SDLoc DL(N);
27227
27228 SDValue Mask = N->getOperand(0);
27229 SDValue In1 = N->getOperand(1);
27230 SDValue In2 = N->getOperand(2);
27231
27232 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
27233 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
27234 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
27235 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
27236}
27237
27238static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
27239 EVT VT = N->getValueType(0);
27240
27241 SDValue Insert = N->getOperand(0);
27242 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
27243 return SDValue();
27244
27245 if (!Insert.getOperand(0).isUndef())
27246 return SDValue();
27247
27248 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
27249 uint64_t IdxDupLane = N->getConstantOperandVal(1);
27250 if (IdxInsert != 0 || IdxDupLane != 0)
27251 return SDValue();
27252
27253 SDValue Bitcast = Insert.getOperand(1);
27254 if (Bitcast.getOpcode() != ISD::BITCAST)
27255 return SDValue();
27256
27257 SDValue Subvec = Bitcast.getOperand(0);
27258 EVT SubvecVT = Subvec.getValueType();
27259 if (!SubvecVT.is128BitVector())
27260 return SDValue();
27261 EVT NewSubvecVT =
27262 getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
27263
27264 SDLoc DL(N);
27265 SDValue NewInsert =
27266 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
27267 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
27268 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
27269 NewInsert, N->getOperand(1));
27270 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
27271}
27272
27273// Try to combine mull with uzp1.
27274static SDValue tryCombineMULLWithUZP1(SDNode *N,
27275 TargetLowering::DAGCombinerInfo &DCI,
27276 SelectionDAG &DAG) {
27277 if (DCI.isBeforeLegalizeOps())
27278 return SDValue();
27279
27280 SDValue LHS = N->getOperand(0);
27281 SDValue RHS = N->getOperand(1);
27282
27283 SDValue ExtractHigh;
27284 SDValue ExtractLow;
27285 SDValue TruncHigh;
27286 SDValue TruncLow;
27287 SDLoc DL(N);
27288
27289 // Check the operands are trunc and extract_high.
27290 if (isEssentiallyExtractHighSubvector(LHS) &&
27291 RHS.getOpcode() == ISD::TRUNCATE) {
27292 TruncHigh = RHS;
27293 if (LHS.getOpcode() == ISD::BITCAST)
27294 ExtractHigh = LHS.getOperand(0);
27295 else
27296 ExtractHigh = LHS;
27297 } else if (isEssentiallyExtractHighSubvector(RHS) &&
27298 LHS.getOpcode() == ISD::TRUNCATE) {
27299 TruncHigh = LHS;
27300 if (RHS.getOpcode() == ISD::BITCAST)
27301 ExtractHigh = RHS.getOperand(0);
27302 else
27303 ExtractHigh = RHS;
27304 } else
27305 return SDValue();
27306
27307 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
27308 // with uzp1.
27309 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
27310 SDValue TruncHighOp = TruncHigh.getOperand(0);
27311 EVT TruncHighOpVT = TruncHighOp.getValueType();
27312 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
27313 DAG.isSplatValue(TruncHighOp, false))
27314 return SDValue();
27315
27316 // Check there is other extract_high with same source vector.
27317 // For example,
27318 //
27319 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
27320 // t12: v4i16 = truncate t11
27321 // t31: v4i32 = AArch64ISD::SMULL t18, t12
27322 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
27323 // t16: v4i16 = truncate t15
27324 // t30: v4i32 = AArch64ISD::SMULL t23, t1
27325 //
27326 // This dagcombine assumes the two extract_high nodes use the same source
27327 // vector in order to detect the pair of the mull. If they use different
27328 // source vectors, this code will not work.
27329 // TODO: Should also try to look through a bitcast.
27330 bool HasFoundMULLow = true;
27331 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
27332 if (ExtractHighSrcVec->use_size() != 2)
27333 HasFoundMULLow = false;
27334
27335 // Find ExtractLow.
27336 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
27337 if (User == ExtractHigh.getNode())
27338 continue;
27339
27340 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
27341 !isNullConstant(User->getOperand(1))) {
27342 HasFoundMULLow = false;
27343 break;
27344 }
27345
27346 ExtractLow.setNode(User);
27347 }
27348
27349 if (!ExtractLow || !ExtractLow->hasOneUse())
27350 HasFoundMULLow = false;
27351
27352 // Check ExtractLow's user.
27353 if (HasFoundMULLow) {
27354 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
27355 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
27356 HasFoundMULLow = false;
27357 } else {
27358 if (ExtractLowUser->getOperand(0) == ExtractLow) {
27359 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
27360 TruncLow = ExtractLowUser->getOperand(1);
27361 else
27362 HasFoundMULLow = false;
27363 } else {
27364 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
27365 TruncLow = ExtractLowUser->getOperand(0);
27366 else
27367 HasFoundMULLow = false;
27368 }
27369 }
27370 }
27371
27372 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
27373 // with uzp1.
27374 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
27375 EVT TruncHighVT = TruncHigh.getValueType();
27376 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27377 SDValue TruncLowOp =
27378 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
27379 EVT TruncLowOpVT = TruncLowOp.getValueType();
27380 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
27381 DAG.isSplatValue(TruncLowOp, false)))
27382 return SDValue();
27383
27384 // Create uzp1, extract_high and extract_low.
27385 if (TruncHighOpVT != UZP1VT)
27386 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
27387 if (TruncLowOpVT != UZP1VT)
27388 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
27389
27390 SDValue UZP1 =
27391 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
27392 SDValue HighIdxCst =
27393 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
27394 SDValue NewTruncHigh =
27395 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
27396 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
27397
27398 if (HasFoundMULLow) {
27399 EVT TruncLowVT = TruncLow.getValueType();
27400 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
27401 UZP1, ExtractLow.getOperand(1));
27402 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
27403 }
27404
27405 return SDValue(N, 0);
27406}
27407
27408static SDValue performMULLCombine(SDNode *N,
27409 TargetLowering::DAGCombinerInfo &DCI,
27410 SelectionDAG &DAG) {
27411 if (SDValue Val =
27412 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
27413 return Val;
27414
27415 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
27416 return Val;
27417
27418 return SDValue();
27419}
27420
27421 static SDValue
27422performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
27423 SelectionDAG &DAG) {
27424 // Let's do below transform.
27425 //
27426 // t34: v4i32 = AArch64ISD::UADDLV t2
27427 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
27428 // t7: i64 = zero_extend t35
27429 // t20: v1i64 = scalar_to_vector t7
27430 // ==>
27431 // t34: v4i32 = AArch64ISD::UADDLV t2
27432 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
27433 // t40: v1i64 = AArch64ISD::NVCAST t39
27434 if (DCI.isBeforeLegalizeOps())
27435 return SDValue();
27436
27437 EVT VT = N->getValueType(0);
27438 if (VT != MVT::v1i64)
27439 return SDValue();
27440
27441 SDValue ZEXT = N->getOperand(0);
27442 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
27443 return SDValue();
27444
27445 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
27446 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
27447 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
27448 return SDValue();
27449
27450 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
27451 return SDValue();
27452
27453 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
27454 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
27455 UADDLV.getValueType() != MVT::v4i32 ||
27456 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
27457 return SDValue();
27458
27459 // Let's generate new sequence with AArch64ISD::NVCAST.
27460 SDLoc DL(N);
27461 SDValue EXTRACT_SUBVEC =
27462 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
27463 DAG.getConstant(0, DL, MVT::i64));
27464 SDValue NVCAST =
27465 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
27466
27467 return NVCAST;
27468}
27469
27470static SDValue performVectorDeinterleaveCombine(
27471 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
27472 if (!DCI.isBeforeLegalize())
27473 return SDValue();
27474
27475 unsigned NumParts = N->getNumOperands();
27476 if (NumParts != 2 && NumParts != 4)
27477 return SDValue();
27478
27479 EVT SubVecTy = N->getValueType(0);
27480
27481 // At the moment we're unlikely to see a fixed-width vector deinterleave as
27482 // we usually generate shuffles instead.
27483 unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
27484 if (!SubVecTy.isScalableVector() ||
27485 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
27486 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
27487 return SDValue();
27488
27489 // Make sure each input operand is the correct extract_subvector of the same
27490 // wider vector.
27491 SDValue Op0 = N->getOperand(0);
27492 for (unsigned I = 0; I < NumParts; I++) {
27493 SDValue OpI = N->getOperand(I);
27494 if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
27495 OpI->getOperand(0) != Op0->getOperand(0))
27496 return SDValue();
27497 if (OpI->getConstantOperandVal(1) != (I * MinNumElements))
27498 return SDValue();
27499 }
27500
27501 // Normal loads are currently already handled by the InterleavedAccessPass so
27502 // we don't expect to see them here. Bail out if the masked load has an
27503 // unexpected number of uses, since we want to avoid a situation where we have
27504 // both deinterleaving loads and normal loads in the same block. Also, discard
27505 // masked loads that are extending, indexed, have an unexpected offset or have
27506 // an unsupported passthru value until we find a valid use case.
27507 auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
27508 if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
27509 !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) ||
27510 !MaskedLoad->getOffset().isUndef() ||
27511 (!MaskedLoad->getPassThru()->isUndef() &&
27512 !isZerosVector(MaskedLoad->getPassThru().getNode())))
27513 return SDValue();
27514
27515 // Now prove that the mask is an interleave of identical masks.
27516 SDLoc DL(N);
27517 SDValue NarrowMask =
27518 getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
27519 if (!NarrowMask)
27520 return SDValue();
27521
27522 const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
27523 : Intrinsic::aarch64_sve_ld4_sret;
27524 SDValue NewLdOps[] = {MaskedLoad->getChain(),
27525 DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
27526 MaskedLoad->getBasePtr()};
27527 SDValue Res;
27528 if (NumParts == 2)
27529 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
27530 {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
27531 else
27532 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
27533 {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
27534 NewLdOps);
27535
27536 // We can now generate a structured load!
27537 SmallVector<SDValue, 4> ResOps(NumParts);
27538 for (unsigned Idx = 0; Idx < NumParts; Idx++)
27539 ResOps[Idx] = SDValue(Res.getNode(), Idx);
27540
27541 // Replace uses of the original chain result with the new chain result.
27542 DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
27543 SDValue(Res.getNode(), NumParts));
27544 return DCI.CombineTo(N, ResOps, false);
27545}
27546
27547/// If the operand is a bitwise AND with a constant RHS, and the shift has a
27548/// constant RHS and is the only use, we can pull it out of the shift, i.e.
27549///
27550/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
27551///
27552/// We prefer this canonical form to match existing isel patterns.
27553static SDValue performSHLCombine(SDNode *N,
27554 TargetLowering::DAGCombinerInfo &DCI,
27555 SelectionDAG &DAG) {
27556 if (DCI.isBeforeLegalizeOps())
27557 return SDValue();
27558
27559 SDValue Op0 = N->getOperand(0);
27560 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
27561 return SDValue();
27562
27563 SDValue C1 = Op0->getOperand(1);
27564 SDValue C2 = N->getOperand(1);
27565 if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
27566 return SDValue();
27567
27568 // Might be folded into shifted op, do not lower.
27569 if (N->hasOneUse()) {
27570 unsigned UseOpc = N->user_begin()->getOpcode();
27571 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
27572 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
27573 return SDValue();
27574 }
27575
27576 SDLoc DL(N);
27577 EVT VT = N->getValueType(0);
27578
27579 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
27580 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
27581 // causing infinite loop. Result may also be worse.
27582 SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
27583 if (!isa<ConstantSDNode>(NewRHS))
27584 return SDValue();
27585
27586 SDValue X = Op0->getOperand(0);
27587 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
27588 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
27589}
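// Editor's note (illustrative, not part of the upstream source):
//   (shl (and x, 0xff), 8) -> (and (shl x, 8), 0xff00)
// The shifted mask must constant-fold, and the combine is skipped when the
// shift could instead fold into the operand of an add/sub/cmp user.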
27590
27592 unsigned IntrinsicID = N->getConstantOperandVal(1);
27593 auto Register =
27594 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
27595 : AArch64SysReg::RNDRRS);
27596 SDLoc DL(N);
27597 SDValue A = DAG.getNode(
27598 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
27599 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
27600 SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
27601 DAG.getConstant(0, DL, MVT::i32),
27602 DAG.getConstant(0, DL, MVT::i32),
27603 getCondCode(DAG, AArch64CC::NE), A.getValue(1));
27604 return DAG.getMergeValues(
27605 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
27606}
27607
27608SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
27609 DAGCombinerInfo &DCI) const {
27610 SelectionDAG &DAG = DCI.DAG;
27611 switch (N->getOpcode()) {
27612 default:
27613 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
27614 break;
27615 case ISD::VECTOR_DEINTERLEAVE:
27616 return performVectorDeinterleaveCombine(N, DCI, DAG);
27617 case ISD::VECREDUCE_AND:
27618 case ISD::VECREDUCE_OR:
27619 case ISD::VECREDUCE_XOR:
27620 return performVecReduceBitwiseCombine(N, DCI, DAG);
27621 case ISD::ADD:
27622 case ISD::SUB:
27623 return performAddSubCombine(N, DCI);
27624 case ISD::BUILD_VECTOR:
27625 return performBuildVectorCombine(N, DCI, DAG);
27626 case ISD::SMIN:
27627 return performSMINCombine(N, DAG);
27628 case ISD::TRUNCATE:
27629 return performTruncateCombine(N, DAG, DCI);
27630 case AArch64ISD::ANDS:
27631 return performFlagSettingCombine(N, DCI, ISD::AND);
27632 case AArch64ISD::ADC:
27633 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27634 return R;
27635 return foldADCToCINC(N, DAG);
27636 case AArch64ISD::SBC:
27637 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
27638 case AArch64ISD::ADCS:
27639 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
27640 return R;
27641 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
27642 case AArch64ISD::SBCS:
27643 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
27644 return R;
27645 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
27646 case AArch64ISD::ADDS:
27647 return performFlagSettingCombine(N, DCI, ISD::ADD);
27648 case AArch64ISD::SUBS:
27649 return performFlagSettingCombine(N, DCI, ISD::SUB);
27650 case AArch64ISD::BICi: {
27651 APInt DemandedBits =
27652 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
27653 APInt DemandedElts =
27654 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
27655
27656 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
27657 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
27658 return SDValue();
27659
27660 break;
27661 }
27662 case ISD::XOR:
27663 return performXorCombine(N, DAG, DCI, Subtarget);
27664 case ISD::MUL:
27665 return performMulCombine(N, DAG, DCI, Subtarget);
27666 case ISD::SINT_TO_FP:
27667 case ISD::UINT_TO_FP:
27668 return performIntToFpCombine(N, DAG, DCI, Subtarget);
27669 case ISD::FP_TO_SINT:
27670 case ISD::FP_TO_UINT:
27671 case ISD::FP_TO_SINT_SAT:
27672 case ISD::FP_TO_UINT_SAT:
27673 return performFpToIntCombine(N, DAG, DCI, Subtarget);
27674 case ISD::OR:
27675 return performORCombine(N, DCI, Subtarget, *this);
27676 case ISD::AND:
27677 return performANDCombine(N, DCI);
27678 case ISD::FADD:
27679 return performFADDCombine(N, DCI);
27680 case ISD::INTRINSIC_WO_CHAIN:
27681 return performIntrinsicCombine(N, DCI, Subtarget);
27682 case ISD::ANY_EXTEND:
27683 case ISD::ZERO_EXTEND:
27684 case ISD::SIGN_EXTEND:
27685 return performExtendCombine(N, DCI, DAG);
27686 case ISD::SIGN_EXTEND_INREG:
27687 return performSignExtendInRegCombine(N, DCI, DAG);
27688 case ISD::CONCAT_VECTORS:
27689 return performConcatVectorsCombine(N, DCI, DAG);
27690 case ISD::EXTRACT_SUBVECTOR:
27691 return performExtractSubvectorCombine(N, DCI, DAG);
27692 case ISD::INSERT_SUBVECTOR:
27693 return performInsertSubvectorCombine(N, DCI, DAG);
27694 case ISD::SELECT:
27695 return performSelectCombine(N, DCI);
27696 case ISD::VSELECT:
27697 return performVSelectCombine(N, DCI.DAG);
27698 case ISD::SETCC:
27699 return performSETCCCombine(N, DCI, DAG);
27700 case ISD::LOAD:
27701 return performLOADCombine(N, DCI, DAG, Subtarget);
27702 case ISD::STORE:
27703 return performSTORECombine(N, DCI, DAG, Subtarget);
27704 case ISD::MSTORE:
27705 return performMSTORECombine(N, DCI, DAG, Subtarget);
27706 case ISD::MGATHER:
27707 case ISD::MSCATTER:
27708 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
27709 return performMaskedGatherScatterCombine(N, DCI, DAG);
27710 case ISD::FP_EXTEND:
27711 return performFPExtendCombine(N, DAG, DCI, Subtarget);
27712 case AArch64ISD::BRCOND:
27713 return performBRCONDCombine(N, DCI, DAG);
27714 case AArch64ISD::TBNZ:
27715 case AArch64ISD::TBZ:
27716 return performTBZCombine(N, DCI, DAG);
27717 case AArch64ISD::CSEL:
27718 return performCSELCombine(N, DCI, DAG);
27719 case AArch64ISD::DUP:
27720 case AArch64ISD::DUPLANE8:
27721 case AArch64ISD::DUPLANE16:
27722 case AArch64ISD::DUPLANE32:
27723 case AArch64ISD::DUPLANE64:
27724 return performDUPCombine(N, DCI);
27725 case AArch64ISD::DUPLANE128:
27726 return performDupLane128Combine(N, DAG);
27727 case AArch64ISD::NVCAST:
27728 return performNVCASTCombine(N, DAG);
27729 case AArch64ISD::SPLICE:
27730 return performSpliceCombine(N, DAG);
27731 case AArch64ISD::UUNPKLO:
27732 case AArch64ISD::UUNPKHI:
27733 return performUnpackCombine(N, DAG, Subtarget);
27734 case AArch64ISD::UZP1:
27735 case AArch64ISD::UZP2:
27736 return performUzpCombine(N, DAG, Subtarget);
27737 case AArch64ISD::SETCC_MERGE_ZERO:
27738 return performSetccMergeZeroCombine(N, DCI);
27739 case AArch64ISD::REINTERPRET_CAST:
27740 return performReinterpretCastCombine(N, DAG);
27741 case AArch64ISD::GLD1_MERGE_ZERO:
27742 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
27743 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
27744 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
27745 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
27746 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
27747 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
27748 case AArch64ISD::GLD1S_MERGE_ZERO:
27749 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
27750 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
27751 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
27752 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
27753 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
27754 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
27755 return performGLD1Combine(N, DAG);
27756 case AArch64ISD::VASHR:
27757 case AArch64ISD::VLSHR:
27758 return performVectorShiftCombine(N, *this, DCI);
27759 case AArch64ISD::SUNPKLO:
27760 return performSunpkloCombine(N, DAG);
27761 case AArch64ISD::BSP:
27762 return performBSPExpandForSVE(N, DAG, Subtarget);
27763 case ISD::INSERT_VECTOR_ELT:
27764 return performInsertVectorEltCombine(N, DCI);
27765 case ISD::EXTRACT_VECTOR_ELT:
27766 return performExtractVectorEltCombine(N, DCI, Subtarget);
27767 case ISD::VECREDUCE_ADD:
27768 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
27769 case ISD::GET_ACTIVE_LANE_MASK:
27770 return performActiveLaneMaskCombine(N, DCI, Subtarget);
27771 case AArch64ISD::UADDV:
27772 return performUADDVCombine(N, DAG);
27773 case AArch64ISD::SMULL:
27774 case AArch64ISD::UMULL:
27775 case AArch64ISD::PMULL:
27776 return performMULLCombine(N, DCI, DAG);
27777 case ISD::INTRINSIC_VOID:
27778 case ISD::INTRINSIC_W_CHAIN:
27779 switch (N->getConstantOperandVal(1)) {
27780 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
27781 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
27782 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
27783 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
27784 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
27785 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
27786 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
27787 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
27788 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
27789 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
27790 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
27791 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
27792 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
27793 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
27794 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
27795 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
27797 case Intrinsic::aarch64_neon_ld2:
27798 case Intrinsic::aarch64_neon_ld3:
27799 case Intrinsic::aarch64_neon_ld4:
27800 case Intrinsic::aarch64_neon_ld1x2:
27801 case Intrinsic::aarch64_neon_ld1x3:
27802 case Intrinsic::aarch64_neon_ld1x4:
27803 case Intrinsic::aarch64_neon_ld2lane:
27804 case Intrinsic::aarch64_neon_ld3lane:
27805 case Intrinsic::aarch64_neon_ld4lane:
27806 case Intrinsic::aarch64_neon_ld2r:
27807 case Intrinsic::aarch64_neon_ld3r:
27808 case Intrinsic::aarch64_neon_ld4r:
27809 case Intrinsic::aarch64_neon_st2:
27810 case Intrinsic::aarch64_neon_st3:
27811 case Intrinsic::aarch64_neon_st4:
27812 case Intrinsic::aarch64_neon_st1x2:
27813 case Intrinsic::aarch64_neon_st1x3:
27814 case Intrinsic::aarch64_neon_st1x4:
27815 case Intrinsic::aarch64_neon_st2lane:
27816 case Intrinsic::aarch64_neon_st3lane:
27817 case Intrinsic::aarch64_neon_st4lane:
27818 return performNEONPostLDSTCombine(N, DCI, DAG);
27819 case Intrinsic::aarch64_sve_ldnt1:
27820 return performLDNT1Combine(N, DAG);
27821 case Intrinsic::aarch64_sve_ld1rq:
27822 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
27823 case Intrinsic::aarch64_sve_ld1ro:
27824 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
27825 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
27826 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27827 case Intrinsic::aarch64_sve_ldnt1_gather:
27828 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27829 case Intrinsic::aarch64_sve_ldnt1_gather_index:
27830 return performGatherLoadCombine(N, DAG,
27831 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
27832 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
27833 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
27834 case Intrinsic::aarch64_sve_ld1:
27835 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
27836 case Intrinsic::aarch64_sve_ldnf1:
27837 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
27838 case Intrinsic::aarch64_sve_ldff1:
27839 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
27840 case Intrinsic::aarch64_sve_st1:
27841 return performST1Combine(N, DAG);
27842 case Intrinsic::aarch64_sve_stnt1:
27843 return performSTNT1Combine(N, DAG);
27844 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
27845 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27846 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
27847 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27848 case Intrinsic::aarch64_sve_stnt1_scatter:
27849 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
27850 case Intrinsic::aarch64_sve_stnt1_scatter_index:
27851 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
27852 case Intrinsic::aarch64_sve_ld1_gather:
27853 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
27854 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
27855 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
27856 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
27857 case Intrinsic::aarch64_sve_ld1q_gather_index:
27858 return performGatherLoadCombine(N, DAG,
27859 AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
27860 case Intrinsic::aarch64_sve_ld1_gather_index:
27861 return performGatherLoadCombine(N, DAG,
27862 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
27863 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
27864 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
27865 /*OnlyPackedOffsets=*/false);
27866 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
27867 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
27868 /*OnlyPackedOffsets=*/false);
27869 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
27870 return performGatherLoadCombine(N, DAG,
27871 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
27872 /*OnlyPackedOffsets=*/false);
27873 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
27874 return performGatherLoadCombine(N, DAG,
27875 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
27876 /*OnlyPackedOffsets=*/false);
27877 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
27878 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
27879 case Intrinsic::aarch64_sve_ldff1_gather:
27880 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
27881 case Intrinsic::aarch64_sve_ldff1_gather_index:
27882 return performGatherLoadCombine(N, DAG,
27883 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
27884 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
27885 return performGatherLoadCombine(N, DAG,
27886 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
27887 /*OnlyPackedOffsets=*/false);
27888 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
27889 return performGatherLoadCombine(N, DAG,
27890 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
27891 /*OnlyPackedOffsets=*/false);
27892 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
27893 return performGatherLoadCombine(N, DAG,
27894 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
27895 /*OnlyPackedOffsets=*/false);
27896 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
27897 return performGatherLoadCombine(N, DAG,
27898 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
27899 /*OnlyPackedOffsets=*/false);
27900 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
27901 return performGatherLoadCombine(N, DAG,
27902 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
27903 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
27904 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
27905 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
27906 case Intrinsic::aarch64_sve_st1q_scatter_index:
27907 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
27908 case Intrinsic::aarch64_sve_st1_scatter:
27909 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
27910 case Intrinsic::aarch64_sve_st1_scatter_index:
27911 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
27912 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
27913 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
27914 /*OnlyPackedOffsets=*/false);
27915 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
27916 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
27917 /*OnlyPackedOffsets=*/false);
27918 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
27919 return performScatterStoreCombine(N, DAG,
27920 AArch64ISD::SST1_SXTW_SCALED_PRED,
27921 /*OnlyPackedOffsets=*/false);
27922 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
27923 return performScatterStoreCombine(N, DAG,
27924 AArch64ISD::SST1_UXTW_SCALED_PRED,
27925 /*OnlyPackedOffsets=*/false);
27926 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
27927 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
27928 case Intrinsic::aarch64_rndr:
27929 case Intrinsic::aarch64_rndrrs:
27930 return performRNDRCombine(N, DAG);
27931 case Intrinsic::aarch64_sme_ldr_zt:
27932 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
27933 DAG.getVTList(MVT::Other), N->getOperand(0),
27934 N->getOperand(2), N->getOperand(3));
27935 case Intrinsic::aarch64_sme_str_zt:
27936 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
27937 DAG.getVTList(MVT::Other), N->getOperand(0),
27938 N->getOperand(2), N->getOperand(3));
27939 default:
27940 break;
27941 }
27942 break;
27943 case ISD::GlobalAddress:
27944 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
27945 case ISD::CTLZ:
27946 return performCTLZCombine(N, DAG, Subtarget);
27947 case ISD::SCALAR_TO_VECTOR:
27948 return performScalarToVectorCombine(N, DCI, DAG);
27949 case ISD::SHL:
27950 return performSHLCombine(N, DCI, DAG);
27951 }
27952 return SDValue();
27953}
27954
27955// Check if the return value is used only as a return value, as otherwise
27956// we can't perform a tail-call. In particular, we need to check for
27957// target ISD nodes that are returns and any other "odd" constructs
27958// that the generic analysis code won't necessarily catch.
27959bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
27960 SDValue &Chain) const {
27961 if (N->getNumValues() != 1)
27962 return false;
27963 if (!N->hasNUsesOfValue(1, 0))
27964 return false;
27965
27966 SDValue TCChain = Chain;
27967 SDNode *Copy = *N->user_begin();
27968 if (Copy->getOpcode() == ISD::CopyToReg) {
27969 // If the copy has a glue operand, we conservatively assume it isn't safe to
27970 // perform a tail call.
27971 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
27972 MVT::Glue)
27973 return false;
27974 TCChain = Copy->getOperand(0);
27975 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
27976 return false;
27977
27978 bool HasRet = false;
27979 for (SDNode *Node : Copy->users()) {
27980 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
27981 return false;
27982 HasRet = true;
27983 }
27984
27985 if (!HasRet)
27986 return false;
27987
27988 Chain = TCChain;
27989 return true;
27990}
27991
27992// Return whether an instruction can potentially be optimized to a tail
27993// call. This will cause the optimizers to attempt to move, or duplicate,
27994// return instructions to help enable tail call optimizations for this
27995// instruction.
27996bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
27997 return CI->isTailCall();
27998}
27999
28000bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
28001 Register Offset, bool IsPre,
28002 MachineRegisterInfo &MRI) const {
28003 auto CstOffset = getIConstantVRegVal(Offset, MRI);
28004 if (!CstOffset || CstOffset->isZero())
28005 return false;
28006
28007 // All of the indexed addressing mode instructions take a signed 9 bit
28008 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
28009 // encodes the sign/indexing direction.
28010 return isInt<9>(CstOffset->getSExtValue());
28011}
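// A minimal standalone sketch of the signed 9-bit window checked above, using
// only the standard library; isSignedImm9 is a hypothetical helper name, not
// something defined elsewhere in this file.
#include <cstdint>

static bool isSignedImm9(int64_t Offset) {
  // Pre/post-indexed LDR/STR immediates must lie in [-256, 255].
  return Offset >= -256 && Offset <= 255;
}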
28012
28013bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
28014 SDValue &Base,
28015 SDValue &Offset,
28016 SelectionDAG &DAG) const {
28017 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
28018 return false;
28019
28020 // Non-null if there is exactly one user of the loaded value (ignoring chain).
28021 SDNode *ValOnlyUser = nullptr;
28022 for (SDUse &U : N->uses()) {
28023 if (U.getResNo() == 1)
28024 continue; // Ignore chain.
28025 if (ValOnlyUser == nullptr)
28026 ValOnlyUser = U.getUser();
28027 else {
28028 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
28029 break;
28030 }
28031 }
28032
28033 auto IsUndefOrZero = [](SDValue V) {
28034 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
28035 };
28036
28037 // If the only user of the value is a scalable vector splat, it is
28038 // preferable to do a replicating load (ld1r*).
28039 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
28040 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
28041 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
28042 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
28043 return false;
28044
28045 Base = Op->getOperand(0);
28046 // All of the indexed addressing mode instructions take a signed
28047 // 9 bit immediate offset.
28048 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
28049 int64_t RHSC = RHS->getSExtValue();
28050 if (Op->getOpcode() == ISD::SUB)
28051 RHSC = -(uint64_t)RHSC;
28052 if (!isInt<9>(RHSC))
28053 return false;
28054 // When big-endian VLD1/VST1 are used for vector load and store, they
28055 // only allow an offset that's equal to the store size.
28056 EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
28057 if (!Subtarget->isLittleEndian() && MemType.isVector() &&
28058 (uint64_t)RHSC != MemType.getStoreSize())
28059 return false;
28060 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
28061 // when dealing with subtraction.
28062 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
28063 return true;
28064 }
28065 return false;
28066}
28067
28068bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
28069 SDValue &Offset,
28070 ISD::MemIndexedMode &AM,
28071 SelectionDAG &DAG) const {
28072 EVT VT;
28073 SDValue Ptr;
28074 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
28075 VT = LD->getMemoryVT();
28076 Ptr = LD->getBasePtr();
28077 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
28078 VT = ST->getMemoryVT();
28079 Ptr = ST->getBasePtr();
28080 } else
28081 return false;
28082
28083 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
28084 return false;
28085 AM = ISD::PRE_INC;
28086 return true;
28087}
28088
28089bool AArch64TargetLowering::getPostIndexedAddressParts(
28090 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
28091 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
28092 EVT VT;
28093 SDValue Ptr;
28094 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
28095 VT = LD->getMemoryVT();
28096 Ptr = LD->getBasePtr();
28097 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
28098 VT = ST->getMemoryVT();
28099 Ptr = ST->getBasePtr();
28100 } else
28101 return false;
28102
28103 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
28104 return false;
28105 // Post-indexing updates the base, so it's not a valid transform
28106 // if that's not the same as the load's pointer.
28107 if (Ptr != Base)
28108 return false;
28109 AM = ISD::POST_INC;
28110 return true;
28111}
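// For orientation, the two indexed forms selected through the hooks above
// correspond roughly to assembly like the following (illustrative only, not
// emitted by this code):
//   pre-indexed : ldr x0, [x1, #8]!   ; x1 is updated to x1+8, then loaded from
//   post-indexed: ldr x0, [x1], #8    ; loaded from x1, then x1 is updated to x1+8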
28112
28113 static void replaceBoolVectorBitcast(SDNode *N,
28114 SmallVectorImpl<SDValue> &Results,
28115 SelectionDAG &DAG) {
28116 SDLoc DL(N);
28117 SDValue Op = N->getOperand(0);
28118 EVT VT = N->getValueType(0);
28119 [[maybe_unused]] EVT SrcVT = Op.getValueType();
28120 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
28121 "Must be bool vector.");
28122
28123 // Special handling for Clang's __builtin_convertvector. For vectors with <8
28124 // elements, it adds a vector concatenation with undef(s). If we encounter
28125 // this here, we can skip the concat.
28126 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
28127 bool AllUndef = true;
28128 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
28129 AllUndef &= Op.getOperand(I).isUndef();
28130
28131 if (AllUndef)
28132 Op = Op.getOperand(0);
28133 }
28134
28135 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
28136 if (VectorBits)
28137 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
28138}
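// A hedged sketch of the scalar bitmask view produced for a bool vector,
// written in plain C++ over an array of lane predicates (boolsToBitmask is a
// hypothetical helper, not part of this lowering):
#include <cstddef>
#include <cstdint>

static uint64_t boolsToBitmask(const bool *Lanes, size_t NumLanes) {
  uint64_t Mask = 0;
  for (size_t I = 0; I < NumLanes && I < 64; ++I)
    if (Lanes[I])
      Mask |= uint64_t(1) << I; // lane I becomes bit I of the scalar result
  return Mask;
}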
28139
28140 static void CustomNonLegalBITCASTResults(SDNode *N,
28141 SmallVectorImpl<SDValue> &Results,
28142 SelectionDAG &DAG, EVT ExtendVT,
28143 EVT CastVT) {
28144 SDLoc DL(N);
28145 SDValue Op = N->getOperand(0);
28146 EVT VT = N->getValueType(0);
28147
28148 // Use SCALAR_TO_VECTOR for lane zero
28149 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
28150 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
28151 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
28152 Results.push_back(
28153 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
28154}
28155
28156void AArch64TargetLowering::ReplaceBITCASTResults(
28157 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28158 SDLoc DL(N);
28159 SDValue Op = N->getOperand(0);
28160 EVT VT = N->getValueType(0);
28161 EVT SrcVT = Op.getValueType();
28162
28163 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
28164 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
28165 return;
28166 }
28167
28168 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
28169 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
28170 return;
28171 }
28172
28173 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
28174 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
28175 return;
28176 }
28177
28178 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
28179 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
28180 "Expected fp->int bitcast!");
28181
28182 // Bitcasting between unpacked vector types of different element counts is
28183 // not a NOP because the live elements are laid out differently.
28184 // 01234567
28185 // e.g. nxv2i32 = XX??XX??
28186 // nxv4f16 = X?X?X?X?
28187 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
28188 return;
28189
28190 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
28191 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
28192 return;
28193 }
28194
28195 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
28196 !VT.isVector())
28197 return replaceBoolVectorBitcast(N, Results, DAG);
28198
28199 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
28200 return;
28201
28202 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
28203 DAG.getUNDEF(MVT::i32), Op);
28204 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
28205 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
28206}
28207
28208 static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
28209 SelectionDAG &DAG,
28210 const AArch64Subtarget *Subtarget) {
28211 EVT VT = N->getValueType(0);
28212 if (!VT.is256BitVector() ||
28213 (VT.getScalarType().isFloatingPoint() &&
28214 !N->getFlags().hasAllowReassociation()) ||
28215 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
28216 VT.getScalarType() == MVT::bf16)
28217 return;
28218
28219 SDValue X = N->getOperand(0);
28220 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
28221 if (!Shuf) {
28222 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
28223 X = N->getOperand(1);
28224 if (!Shuf)
28225 return;
28226 }
28227
28228 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
28229 return;
28230
28231 // Check the mask is 1,0,3,2,5,4,...
28232 ArrayRef<int> Mask = Shuf->getMask();
28233 for (int I = 0, E = Mask.size(); I < E; I++)
28234 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
28235 return;
28236
28237 SDLoc DL(N);
28238 auto LoHi = DAG.SplitVector(X, DL);
28239 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
28240 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
28241 LoHi.first, LoHi.second);
28242
28243 // Shuffle the elements back into order.
28244 SmallVector<int> NMask;
28245 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
28246 NMask.push_back(I);
28247 NMask.push_back(I);
28248 }
28249 Results.push_back(
28250 DAG.getVectorShuffle(VT, DL,
28251 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
28252 DAG.getUNDEF(LoHi.first.getValueType())),
28253 DAG.getUNDEF(VT), NMask));
28254}
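// The mask test above accepts only the pairwise swap 1,0,3,2,5,4,... A small
// standalone restatement of that check, shown only to make the pattern
// explicit (isPairwiseSwapMask is a hypothetical name):
#include <vector>

static bool isPairwiseSwapMask(const std::vector<int> &Mask) {
  for (int I = 0, E = static_cast<int>(Mask.size()); I < E; ++I)
    if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
      return false;
  return true;
}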
28255
28256 static void ReplaceReductionResults(SDNode *N,
28257 SmallVectorImpl<SDValue> &Results,
28258 SelectionDAG &DAG, unsigned InterOp,
28259 unsigned AcrossOp) {
28260 EVT LoVT, HiVT;
28261 SDValue Lo, Hi;
28262 SDLoc DL(N);
28263 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
28264 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
28265 SDValue InterVal = DAG.getNode(InterOp, DL, LoVT, Lo, Hi);
28266 SDValue SplitVal = DAG.getNode(AcrossOp, DL, LoVT, InterVal);
28267 Results.push_back(SplitVal);
28268}
28269
28270void AArch64TargetLowering::ReplaceExtractSubVectorResults(
28271 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28272 SDValue In = N->getOperand(0);
28273 EVT InVT = In.getValueType();
28274
28275 // Common code will handle these just fine.
28276 if (!InVT.isScalableVector() || !InVT.isInteger())
28277 return;
28278
28279 SDLoc DL(N);
28280 EVT VT = N->getValueType(0);
28281
28282 // The following checks bail if this is not a halving operation.
28283
28284 ElementCount ResEC = VT.getVectorElementCount();
28285
28286 if (InVT.getVectorElementCount() != (ResEC * 2))
28287 return;
28288
28289 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
28290 if (!CIndex)
28291 return;
28292
28293 unsigned Index = CIndex->getZExtValue();
28294 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
28295 return;
28296
28297 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
28298 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
28299
28300 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
28301 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
28302}
28303
28304void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
28305 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28306 assert((Subtarget->hasSVE2p1() ||
28307 (Subtarget->hasSME2() && Subtarget->isStreaming())) &&
28308 "Custom lower of get.active.lane.mask missing required feature.");
28309
28310 assert(N->getValueType(0) == MVT::nxv32i1 &&
28311 "Unexpected result type for get.active.lane.mask");
28312
28313 SDLoc DL(N);
28314 SDValue Idx = N->getOperand(0);
28315 SDValue TC = N->getOperand(1);
28316
28317 assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
28318 "Unexpected operand type for get.active.lane.mask");
28319
28320 if (Idx.getValueType() != MVT::i64) {
28321 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
28322 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
28323 }
28324
28325 SDValue ID =
28326 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
28327 EVT HalfVT = N->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
28328 auto WideMask =
28329 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {HalfVT, HalfVT}, {ID, Idx, TC});
28330
28331 Results.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
28332 {WideMask.getValue(0), WideMask.getValue(1)}));
28333}
28334
28335// Create an even/odd pair of X registers holding integer value V.
28336 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
28337 SDLoc DL(V.getNode());
28338 auto [VLo, VHi] = DAG.SplitScalar(V, DL, MVT::i64, MVT::i64);
28339 if (DAG.getDataLayout().isBigEndian())
28340 std::swap (VLo, VHi);
28341 SDValue RegClass =
28342 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
28343 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
28344 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);
28345 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
28346 return SDValue(
28347 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops), 0);
28348}
28349
28350 static void ReplaceCMP_SWAP_128Results(SDNode *N,
28351 SmallVectorImpl<SDValue> &Results,
28352 SelectionDAG &DAG,
28353 const AArch64Subtarget *Subtarget) {
28354 assert(N->getValueType(0) == MVT::i128 &&
28355 "AtomicCmpSwap on types less than 128 should be legal");
28356
28357 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
28358 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
28359 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
28360 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
28361 SDValue Ops[] = {
28362 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
28363 createGPRPairNode(DAG, N->getOperand(3)), // Store value
28364 N->getOperand(1), // Ptr
28365 N->getOperand(0), // Chain in
28366 };
28367
28368 unsigned Opcode;
28369 switch (MemOp->getMergedOrdering()) {
28371 Opcode = AArch64::CASPX;
28372 break;
28374 Opcode = AArch64::CASPAX;
28375 break;
28377 Opcode = AArch64::CASPLX;
28378 break;
28381 Opcode = AArch64::CASPALX;
28382 break;
28383 default:
28384 llvm_unreachable("Unexpected ordering!");
28385 }
28386
28387 MachineSDNode *CmpSwap = DAG.getMachineNode(
28388 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
28389 DAG.setNodeMemRefs(CmpSwap, {MemOp});
28390
28391 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
28392 if (DAG.getDataLayout().isBigEndian())
28393 std::swap(SubReg1, SubReg2);
28394 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
28395 SDValue(CmpSwap, 0));
28396 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
28397 SDValue(CmpSwap, 0));
28398 Results.push_back(
28399 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
28400 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
28401 return;
28402 }
28403
28404 unsigned Opcode;
28405 switch (MemOp->getMergedOrdering()) {
28407 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
28408 break;
28410 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
28411 break;
28413 Opcode = AArch64::CMP_SWAP_128_RELEASE;
28414 break;
28417 Opcode = AArch64::CMP_SWAP_128;
28418 break;
28419 default:
28420 llvm_unreachable("Unexpected ordering!");
28421 }
28422
28423 SDLoc DL(N);
28424 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
28425 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
28426 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
28427 New.first, New.second, N->getOperand(0)};
28428 SDNode *CmpSwap = DAG.getMachineNode(
28429 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
28430 Ops);
28431 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
28432
28433 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
28434 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
28435 Results.push_back(SDValue(CmpSwap, 3));
28436}
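// The compare and store values above are handed to the pseudos as lo/hi
// 64-bit halves. A minimal sketch of that split, assuming the Clang/GCC
// unsigned __int128 extension (splitU128 is a hypothetical helper):
#include <cstdint>
#include <utility>

static std::pair<uint64_t, uint64_t> splitU128(unsigned __int128 V) {
  return {static_cast<uint64_t>(V),        // low 64 bits
          static_cast<uint64_t>(V >> 64)}; // high 64 bits
}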
28437
28438static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
28439 AtomicOrdering Ordering) {
28440 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
28441 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
28442 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
28443 // ATOMIC_LOAD_CLR at any point.
28444 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
28445 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
28446 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
28447 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
28448
28449 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
28450 // The operand will need to be XORed in a separate step.
28451 switch (Ordering) {
28453 return AArch64::LDCLRP;
28454 break;
28456 return AArch64::LDCLRPA;
28457 break;
28459 return AArch64::LDCLRPL;
28460 break;
28463 return AArch64::LDCLRPAL;
28464 break;
28465 default:
28466 llvm_unreachable("Unexpected ordering!");
28467 }
28468 }
28469
28470 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
28471 switch (Ordering) {
28473 return AArch64::LDSETP;
28474 break;
28476 return AArch64::LDSETPA;
28477 break;
28479 return AArch64::LDSETPL;
28480 break;
28483 return AArch64::LDSETPAL;
28484 break;
28485 default:
28486 llvm_unreachable("Unexpected ordering!");
28487 }
28488 }
28489
28490 if (ISDOpcode == ISD::ATOMIC_SWAP) {
28491 switch (Ordering) {
28493 return AArch64::SWPP;
28494 break;
28496 return AArch64::SWPPA;
28497 break;
28499 return AArch64::SWPPL;
28500 break;
28503 return AArch64::SWPPAL;
28504 break;
28505 default:
28506 llvm_unreachable("Unexpected ordering!");
28507 }
28508 }
28509
28510 llvm_unreachable("Unexpected ISDOpcode!");
28511}
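// LDCLRP computes Old & ~Operand, so ATOMIC_LOAD_AND is handled by inverting
// the operand first (the XOR with all-ones in the caller below). A plain C++
// sketch of that identity on one 64-bit half (hypothetical helper, not LSE
// codegen):
#include <cstdint>

static uint64_t atomicAndViaClear(uint64_t Old, uint64_t Mask) {
  uint64_t ClearOperand = ~Mask; // operand after the XOR with all-ones
  return Old & ~ClearOperand;    // what LDCLRP stores: Old & Mask
}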
28512
28513 static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
28514 SmallVectorImpl<SDValue> &Results,
28515 SelectionDAG &DAG,
28516 const AArch64Subtarget *Subtarget) {
28517 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
28518 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
28519 // rather than the CASP instructions, because CASP has register classes for
28520 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
28521 // to present them as single operands. LSE128 instructions use the GPR64
28522 // register class (because the pair does not have to be sequential), like
28523 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
28524
28525 assert(N->getValueType(0) == MVT::i128 &&
28526 "AtomicLoadXXX on types less than 128 should be legal");
28527
28528 if (!Subtarget->hasLSE128())
28529 return;
28530
28531 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
28532 const SDValue &Chain = N->getOperand(0);
28533 const SDValue &Ptr = N->getOperand(1);
28534 const SDValue &Val128 = N->getOperand(2);
28535 std::pair<SDValue, SDValue> Val2x64 =
28536 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
28537
28538 const unsigned ISDOpcode = N->getOpcode();
28539 const unsigned MachineOpcode =
28540 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
28541
28542 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
28543 SDLoc DL(Val128);
28544 Val2x64.first =
28545 DAG.getNode(ISD::XOR, DL, MVT::i64,
28546 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.first);
28547 Val2x64.second =
28548 DAG.getNode(ISD::XOR, DL, MVT::i64,
28549 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.second);
28550 }
28551
28552 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
28553 if (DAG.getDataLayout().isBigEndian())
28554 std::swap(Ops[0], Ops[1]);
28555
28556 MachineSDNode *AtomicInst =
28557 DAG.getMachineNode(MachineOpcode, SDLoc(N),
28558 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
28559
28560 DAG.setNodeMemRefs(AtomicInst, {MemOp});
28561
28562 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
28563 if (DAG.getDataLayout().isBigEndian())
28564 std::swap(Lo, Hi);
28565
28566 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
28567 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
28568}
28569
28570void AArch64TargetLowering::ReplaceNodeResults(
28571 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28572 switch (N->getOpcode()) {
28573 default:
28574 llvm_unreachable("Don't know how to custom expand this");
28575 case ISD::BITCAST:
28576 ReplaceBITCASTResults(N, Results, DAG);
28577 return;
28578 case ISD::VECREDUCE_ADD:
28579 case ISD::VECREDUCE_SMAX:
28580 case ISD::VECREDUCE_SMIN:
28581 case ISD::VECREDUCE_UMAX:
28582 case ISD::VECREDUCE_UMIN:
28583 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
28584 return;
28585 case ISD::VECTOR_COMPRESS:
28586 if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
28587 Results.push_back(Res);
28588 return;
28589 case ISD::ADD:
28590 case ISD::FADD:
28591 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
28592 return;
28593
28594 case ISD::CTPOP:
28595 case ISD::PARITY:
28596 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
28597 Results.push_back(Result);
28598 return;
28599 case AArch64ISD::SADDV:
28600 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
28601 return;
28602 case AArch64ISD::UADDV:
28603 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
28604 return;
28605 case AArch64ISD::SMINV:
28606 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
28607 return;
28608 case AArch64ISD::UMINV:
28609 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
28610 return;
28611 case AArch64ISD::SMAXV:
28612 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
28613 return;
28614 case AArch64ISD::UMAXV:
28615 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
28616 return;
28617 case ISD::MULHS:
28619 Results.push_back(
28620 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
28621 return;
28622 case ISD::MULHU:
28624 Results.push_back(
28625 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
28626 return;
28627 case ISD::FP_TO_UINT:
28628 case ISD::FP_TO_SINT:
28629 case ISD::STRICT_FP_TO_SINT:
28630 case ISD::STRICT_FP_TO_UINT:
28631 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
28632 // Let normal code take care of it by not adding anything to Results.
28633 return;
28634 case ISD::ATOMIC_CMP_SWAP:
28635 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
28636 return;
28637 case ISD::ATOMIC_LOAD_CLR:
28638 assert(N->getValueType(0) != MVT::i128 &&
28639 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
28640 break;
28641 case ISD::ATOMIC_LOAD_AND:
28642 case ISD::ATOMIC_LOAD_OR:
28643 case ISD::ATOMIC_SWAP: {
28644 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
28645 "Expected 128-bit atomicrmw.");
28646 // These need custom type legalisation, so we go directly to instructions.
28647 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
28648 return;
28649 }
28650 case ISD::ADDRSPACECAST: {
28651 SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
28652 Results.push_back(V);
28653 return;
28654 }
28655 case ISD::ATOMIC_LOAD:
28656 case ISD::LOAD: {
28657 MemSDNode *LoadNode = cast<MemSDNode>(N);
28658 EVT MemVT = LoadNode->getMemoryVT();
28659 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
28660 // targets.
28661 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
28662 MemVT.getSizeInBits() == 256u &&
28663 (MemVT.getScalarSizeInBits() == 8u ||
28664 MemVT.getScalarSizeInBits() == 16u ||
28665 MemVT.getScalarSizeInBits() == 32u ||
28666 MemVT.getScalarSizeInBits() == 64u)) {
28667
28668 EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
28669 SDValue Result = DAG.getMemIntrinsicNode(
28670 AArch64ISD::LDNP, SDLoc(N),
28671 DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
28672 {LoadNode->getChain(), LoadNode->getBasePtr()},
28673 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28674
28675 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
28676 DAG.getBitcast(HalfVT, Result.getValue(0)),
28677 DAG.getBitcast(HalfVT, Result.getValue(1)));
28678 Results.append({Pair, Result.getValue(2) /* Chain */});
28679 return;
28680 }
28681
28682 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
28683 LoadNode->getMemoryVT() != MVT::i128) {
28684 // Loads that are neither volatile nor atomic, or are not i128, are
28685 // optimized later in AArch64's load/store optimizer.
28686 return;
28687 }
28688
28689 if (SDValue(N, 0).getValueType() == MVT::i128) {
28690 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
28691 bool isLoadAcquire =
28692 AN && AN->getOrdering() == AtomicOrdering::Acquire;
28693 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
28694
28695 if (isLoadAcquire)
28696 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
28697
28698 SDValue Result = DAG.getMemIntrinsicNode(
28699 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28700 {LoadNode->getChain(), LoadNode->getBasePtr()},
28701 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
28702
28703 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
28704
28705 SDValue Pair =
28706 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
28707 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
28708 Results.append({Pair, Result.getValue(2) /* Chain */});
28709 }
28710 return;
28711 }
28712 case ISD::EXTRACT_SUBVECTOR:
28713 ReplaceExtractSubVectorResults(N, Results, DAG);
28714 return;
28715 case ISD::INSERT_SUBVECTOR:
28716 case ISD::CONCAT_VECTORS:
28717 // Custom lowering has been requested for INSERT_SUBVECTOR and
28718 // CONCAT_VECTORS -- but delegate to common code for result type
28719 // legalisation
28720 return;
28721 case ISD::GET_ACTIVE_LANE_MASK:
28722 ReplaceGetActiveLaneMaskResults(N, Results, DAG);
28723 return;
28724 case ISD::INTRINSIC_WO_CHAIN: {
28725 EVT VT = N->getValueType(0);
28726
28727 Intrinsic::ID IntID =
28728 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
28729 switch (IntID) {
28730 default:
28731 return;
28732 case Intrinsic::aarch64_sve_clasta_n: {
28733 assert((VT == MVT::i8 || VT == MVT::i16) &&
28734 "custom lowering for unexpected type");
28735 SDLoc DL(N);
28736 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28737 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
28738 N->getOperand(1), Op2, N->getOperand(3));
28739 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28740 return;
28741 }
28742 case Intrinsic::aarch64_sve_clastb_n: {
28743 assert((VT == MVT::i8 || VT == MVT::i16) &&
28744 "custom lowering for unexpected type");
28745 SDLoc DL(N);
28746 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
28747 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
28748 N->getOperand(1), Op2, N->getOperand(3));
28749 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28750 return;
28751 }
28752 case Intrinsic::aarch64_sve_lasta: {
28753 assert((VT == MVT::i8 || VT == MVT::i16) &&
28754 "custom lowering for unexpected type");
28755 SDLoc DL(N);
28756 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
28757 N->getOperand(1), N->getOperand(2));
28758 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28759 return;
28760 }
28761 case Intrinsic::aarch64_sve_lastb: {
28762 assert((VT == MVT::i8 || VT == MVT::i16) &&
28763 "custom lowering for unexpected type");
28764 SDLoc DL(N);
28765 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
28766 N->getOperand(1), N->getOperand(2));
28767 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28768 return;
28769 }
28770 case Intrinsic::aarch64_sme_in_streaming_mode: {
28771 SDLoc DL(N);
28772 SDValue Chain = DAG.getEntryNode();
28773
28774 SDValue RuntimePStateSM =
28775 getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
28776 Results.push_back(
28777 DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
28778 return;
28779 }
28780 case Intrinsic::experimental_vector_match: {
28781 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
28782 return;
28783
28784 // NOTE: Only trivial type promotion is supported.
28785 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
28786 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
28787 return;
28788
28789 SDLoc DL(N);
28790 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
28791 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
28792 return;
28793 }
28794 }
28795 }
28796 case ISD::READ_REGISTER: {
28797 SDLoc DL(N);
28798 assert(N->getValueType(0) == MVT::i128 &&
28799 "READ_REGISTER custom lowering is only for 128-bit sysregs");
28800 SDValue Chain = N->getOperand(0);
28801 SDValue SysRegName = N->getOperand(1);
28802
28803 SDValue Result = DAG.getNode(
28804 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
28805 Chain, SysRegName);
28806
28807 // Sysregs are not endian. Result.getValue(0) always contains the lower half
28808 // of the 128-bit System Register value.
28809 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
28810 Result.getValue(0), Result.getValue(1));
28811 Results.push_back(Pair);
28812 Results.push_back(Result.getValue(2)); // Chain
28813 return;
28814 }
28815 }
28816}
28817
28819 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
28821 return true;
28822}
28823
28825 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
28826 // reciprocal if there are three or more FDIVs.
28827 return 3;
28828}
28829
28832 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 and v1f32
28833 // to v8i8, v4i16, v2i32 and v2f32 instead of promoting them.
28834 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
28835 VT == MVT::v1f32)
28836 return TypeWidenVector;
28837
28839}
28840
28841// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
28842// provided the address is 16-byte aligned.
28844 if (!Subtarget->hasLSE2())
28845 return false;
28846
28847 if (auto LI = dyn_cast<LoadInst>(I))
28848 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
28849 LI->getAlign() >= Align(16);
28850
28851 if (auto SI = dyn_cast<StoreInst>(I))
28852 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28853 SI->getAlign() >= Align(16);
28854
28855 return false;
28856}
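// A hedged restatement of the gating test above in plain C++: only plain
// 128-bit accesses with at least 16-byte alignment take the single-copy-atomic
// LDP/STP path (qualifiesForLSE2Pair is a hypothetical name):
static bool qualifiesForLSE2Pair(unsigned SizeInBits, unsigned AlignInBytes) {
  return SizeInBits == 128 && AlignInBytes >= 16;
}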
28857
28859 if (!Subtarget->hasLSE128())
28860 return false;
28861
28862 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
28863 // will clobber the two registers.
28864 if (const auto *SI = dyn_cast<StoreInst>(I))
28865 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28866 SI->getAlign() >= Align(16) &&
28867 (SI->getOrdering() == AtomicOrdering::Release ||
28868 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
28869
28870 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
28871 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28872 RMW->getAlign() >= Align(16) &&
28873 (RMW->getOperation() == AtomicRMWInst::Xchg ||
28874 RMW->getOperation() == AtomicRMWInst::And ||
28875 RMW->getOperation() == AtomicRMWInst::Or);
28876
28877 return false;
28878}
28879
28881 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
28882 return false;
28883
28884 if (auto LI = dyn_cast<LoadInst>(I))
28885 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
28886 LI->getAlign() >= Align(16) &&
28887 LI->getOrdering() == AtomicOrdering::Acquire;
28888
28889 if (auto SI = dyn_cast<StoreInst>(I))
28890 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
28891 SI->getAlign() >= Align(16) &&
28892 SI->getOrdering() == AtomicOrdering::Release;
28893
28894 return false;
28895}
28896
28898 const Instruction *I) const {
28900 return false;
28902 return false;
28904 return true;
28905 return false;
28906}
28907
28909 const Instruction *I) const {
28910 // Store-Release instructions only provide seq_cst guarantees when paired with
28911 // Load-Acquire instructions. MSVC CRT does not use these instructions to
28912 // implement seq_cst loads and stores, so we need additional explicit fences
28913 // after memory writes.
28914 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
28915 return false;
28916
28917 switch (I->getOpcode()) {
28918 default:
28919 return false;
28920 case Instruction::AtomicCmpXchg:
28921 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
28923 case Instruction::AtomicRMW:
28924 return cast<AtomicRMWInst>(I)->getOrdering() ==
28926 case Instruction::Store:
28927 return cast<StoreInst>(I)->getOrdering() ==
28929 }
28930}
28931
28932// Loads and stores less than 128 bits are already atomic; ones above that
28933// are doomed anyway, so defer to the default libcall and blame the OS when
28934// things go wrong.
28937 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
28938 if (Size != 128)
28947}
28948
28949// Loads and stores less than 128 bits are already atomic; ones above that
28950// are doomed anyway, so defer to the default libcall and blame the OS when
28951// things go wrong.
28954 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
28955
28956 if (Size != 128)
28958 if (isOpSuitableForRCPC3(LI))
28960 // No LSE128 loads
28961 if (isOpSuitableForLDPSTP(LI))
28963
28964 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28965 // implement atomicrmw without spilling. If the target address is also on the
28966 // stack and close enough to the spill slot, this can lead to a situation
28967 // where the monitor always gets cleared and the atomic operation can never
28968 // succeed. So at -O0 lower this operation to a CAS loop.
28969 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
28971
28972 // Using CAS for an atomic load has a better chance of succeeding under high
28973 // contention situations. So use it if available.
28974 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
28976}
28977
28978// Return true if the atomic operation expansion will lower to use a library
28979// call, and is thus ineligible to use an LLSC expansion.
28980static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
28981 const AtomicRMWInst *RMW) {
28982 if (!RMW->isFloatingPointOperation())
28983 return false;
28984 switch (RMW->getType()->getScalarType()->getTypeID()) {
28985 case Type::FloatTyID:
28986 case Type::DoubleTyID:
28987 case Type::HalfTyID:
28988 case Type::BFloatTyID:
28989 // Will use soft float
28990 return !Subtarget.hasFPARMv8();
28991 default:
28992 // fp128 will emit library calls.
28993 return true;
28994 }
28995
28996 llvm_unreachable("covered type switch");
28997}
28998
28999// The "default" for integer RMW operations is to expand to an LL/SC loop.
29000// However, with the LSE instructions (or outline-atomics mode, which provides
29001// library routines in place of the LSE-instructions), we can directly emit many
29002// operations instead.
29005 Type *Ty = AI->getType();
29006 unsigned Size = Ty->getPrimitiveSizeInBits();
29007 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
29008
29009 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
29013 if (CanUseLSE128)
29015
29016 // If LSFE is available, use atomic FP instructions in preference to expansion
29017 if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
29023
29024 // Nand is not supported in LSE.
29025 // Leave 128 bits to LLSC or CmpXChg.
29026 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
29027 !AI->isFloatingPointOperation()) {
29028 if (Subtarget->hasLSE())
29030 if (Subtarget->outlineAtomics()) {
29031 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
29032 // Don't outline them unless
29033 // (1) high-level <atomic> support is approved:
29034 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
29035 // (2) low-level libgcc and compiler-rt support is implemented by the
29036 // min/max outline atomics helpers.
29037 if (AI->getOperation() != AtomicRMWInst::Min &&
29042 }
29043 }
29044 }
29045
29046 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29047 // implement atomicrmw without spilling. If the target address is also on the
29048 // stack and close enough to the spill slot, this can lead to a situation
29049 // where the monitor always gets cleared and the atomic operation can never
29050 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
29051 // we have a single CAS instruction that can replace the loop.
29052 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
29053 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
29055
29057}
29058
29061 AtomicCmpXchgInst *AI) const {
29062 // If subtarget has LSE, leave cmpxchg intact for codegen.
29063 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
29065 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29066 // implement cmpxchg without spilling. If the address being exchanged is also
29067 // on the stack and close enough to the spill slot, this can lead to a
29068 // situation where the monitor always gets cleared and the atomic operation
29069 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
29070 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
29072
29073 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
29074 // it.
29076 if (Size > 64)
29078
29080}
29081
29083 Type *ValueTy, Value *Addr,
29084 AtomicOrdering Ord) const {
29085 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29086 bool IsAcquire = isAcquireOrStronger(Ord);
29087
29088 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
29089 // intrinsic must return {i64, i64} and we have to recombine them into a
29090 // single i128 here.
29091 if (ValueTy->getPrimitiveSizeInBits() == 128) {
29093 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
29094
29095 Value *LoHi =
29096 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
29097
29098 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
29099 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
29100
29101 auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
29102 Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
29103 Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
29104
29105 Value *Or = Builder.CreateOr(
29106 Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
29107 return Builder.CreateBitCast(Or, ValueTy);
29108 }
29109
29110 Type *Tys[] = { Addr->getType() };
29112 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
29113
29114 const DataLayout &DL = M->getDataLayout();
29115 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
29116 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
29117 CI->addParamAttr(0, Attribute::get(Builder.getContext(),
29118 Attribute::ElementType, IntEltTy));
29119 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
29120
29121 return Builder.CreateBitCast(Trunc, ValueTy);
29122}
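// Sketch of the {lo, hi} -> 128-bit recombination performed on the LDXP/LDAXP
// results above, assuming the unsigned __int128 compiler extension (joinU128
// is a hypothetical helper):
#include <cstdint>

static unsigned __int128 joinU128(uint64_t Lo, uint64_t Hi) {
  return (static_cast<unsigned __int128>(Hi) << 64) | Lo;
}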
29123
29125 IRBuilderBase &Builder) const {
29126 Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {});
29127}
29128
29130 Value *Val, Value *Addr,
29131 AtomicOrdering Ord) const {
29132 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29133 bool IsRelease = isReleaseOrStronger(Ord);
29134
29135 // Since the intrinsics must have legal type, the i128 intrinsics take two
29136 // parameters: "i64, i64". We must marshal Val into the appropriate form
29137 // before the call.
29138 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
29140 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
29142 Type *Int64Ty = Type::getInt64Ty(M->getContext());
29143 Type *Int128Ty = Type::getInt128Ty(M->getContext());
29144
29145 Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
29146
29147 Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
29148 Value *Hi =
29149 Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
29150 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
29151 }
29152
29154 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
29155 Type *Tys[] = { Addr->getType() };
29157
29158 const DataLayout &DL = M->getDataLayout();
29159 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
29160 Val = Builder.CreateBitCast(Val, IntValTy);
29161
29162 CallInst *CI = Builder.CreateCall(
29163 Stxr, {Builder.CreateZExtOrBitCast(
29164 Val, Stxr->getFunctionType()->getParamType(0)),
29165 Addr});
29166 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
29167 Attribute::ElementType, Val->getType()));
29168 return CI;
29169}
29170
29172 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
29173 const DataLayout &DL) const {
29174 if (!Ty->isArrayTy()) {
29175 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
29176 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
29177 }
29178
29179 // All non-aggregate members of the type must have the same type
29180 SmallVector<EVT> ValueVTs;
29181 ComputeValueVTs(*this, DL, Ty, ValueVTs);
29182 return all_equal(ValueVTs);
29183}
29184
29185bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
29186 EVT) const {
29187 return false;
29188}
29189
29190static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
29191 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
29192 Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
29193 M, Intrinsic::thread_pointer, IRB.getPtrTy());
29194 return IRB.CreatePointerCast(
29195 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
29196 Offset),
29197 IRB.getPtrTy(0));
29198}
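// For reference, the helper above builds IR along these lines (illustrative
// only; the actual offset depends on the caller):
//   %tp  = call ptr @llvm.thread_pointer()
//   %gep = getelementptr i8, ptr %tp, i32 <Offset>
// followed by a pointer cast of %gep to the default address space.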
29199
29201 // Android provides a fixed TLS slot for the stack cookie. See the definition
29202 // of TLS_SLOT_STACK_GUARD in
29203 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
29204 if (Subtarget->isTargetAndroid())
29205 return UseTlsOffset(IRB, 0x28);
29206
29207 // Fuchsia is similar.
29208 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
29209 if (Subtarget->isTargetFuchsia())
29210 return UseTlsOffset(IRB, -0x10);
29211
29213}
29214
29216 // MSVC CRT provides functionality for stack protection.
29217 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
29218 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
29219
29220 RTLIB::LibcallImpl SecurityCookieVar =
29221 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
29222 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
29223 SecurityCookieVar != RTLIB::Unsupported) {
29224 // MSVC CRT has a global variable holding security cookie.
29225 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
29226 PointerType::getUnqual(M.getContext()));
29227
29228 // MSVC CRT has a function to validate security cookie.
29229 FunctionCallee SecurityCheckCookie =
29230 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
29231 Type::getVoidTy(M.getContext()),
29232 PointerType::getUnqual(M.getContext()));
29233 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
29234 F->setCallingConv(CallingConv::Win64);
29235 F->addParamAttr(0, Attribute::AttrKind::InReg);
29236 }
29237 return;
29238 }
29240}
29241
29243 // MSVC CRT has a function to validate security cookie.
29244 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
29245 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
29246 if (SecurityCheckCookieLibcall != RTLIB::Unsupported)
29247 return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));
29249}
29250
29251Value *
29253 // Android provides a fixed TLS slot for the SafeStack pointer. See the
29254 // definition of TLS_SLOT_SAFESTACK in
29255 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
29256 if (Subtarget->isTargetAndroid())
29257 return UseTlsOffset(IRB, 0x48);
29258
29259 // Fuchsia is similar.
29260 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
29261 if (Subtarget->isTargetFuchsia())
29262 return UseTlsOffset(IRB, -0x8);
29263
29265}
29266
29267/// If a physical register, this returns the register that receives the
29268/// exception address on entry to an EH pad.
29270 const Constant *PersonalityFn) const {
29271 // FIXME: This is a guess. Has this been defined yet?
29272 return AArch64::X0;
29273}
29274
29275/// If a physical register, this returns the register that receives the
29276/// exception typeid on entry to a landing pad.
29278 const Constant *PersonalityFn) const {
29279 // FIXME: This is a guess. Has this been defined yet?
29280 return AArch64::X1;
29281}
29282
29284 const Instruction &AndI) const {
29285 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
29286 // this is likely to fold the and/cmp/br into a single tbz instruction. It
29287 // may be beneficial to sink in other cases, but we would have to check that
29288 // the cmp would not get folded into the br to form a cbz for these to be
29289 // beneficial.
29291 if (!Mask)
29292 return false;
29293 return Mask->getValue().isPowerOf2();
29294}
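// Single-bit masks are exactly the non-zero powers of two; a standalone
// restatement of the test above (isSingleBitMask is a hypothetical name):
#include <cstdint>

static bool isSingleBitMask(uint64_t Mask) {
  return Mask != 0 && (Mask & (Mask - 1)) == 0;
}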
29295
29299 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
29300 SelectionDAG &DAG) const {
29301 // Does baseline recommend not to perform the fold by default?
29303 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
29304 return false;
29305 // Else, if this is a vector shift, prefer 'shl'.
29306 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
29307}
29308
29311 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
29313 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
29316 ExpansionFactor);
29317}
29318
29320 // Update IsSplitCSR in AArch64FunctionInfo.
29321 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
29322 AFI->setIsSplitCSR(true);
29323}
29324
29326 MachineBasicBlock *Entry,
29327 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
29328 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
29329 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
29330 if (!IStart)
29331 return;
29332
29333 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
29334 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
29335 MachineBasicBlock::iterator MBBI = Entry->begin();
29336 for (const MCPhysReg *I = IStart; *I; ++I) {
29337 const TargetRegisterClass *RC = nullptr;
29338 if (AArch64::GPR64RegClass.contains(*I))
29339 RC = &AArch64::GPR64RegClass;
29340 else if (AArch64::FPR64RegClass.contains(*I))
29341 RC = &AArch64::FPR64RegClass;
29342 else
29343 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
29344
29345 Register NewVR = MRI->createVirtualRegister(RC);
29346 // Create copy from CSR to a virtual register.
29347 // FIXME: this currently does not emit CFI pseudo-instructions, it works
29348 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
29349 // nounwind. If we want to generalize this later, we may need to emit
29350 // CFI pseudo-instructions.
29351 assert(Entry->getParent()->getFunction().hasFnAttribute(
29352 Attribute::NoUnwind) &&
29353 "Function should be nounwind in insertCopiesSplitCSR!");
29354 Entry->addLiveIn(*I);
29355 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
29356 .addReg(*I);
29357
29358 // Insert the copy-back instructions right before the terminator.
29359 for (auto *Exit : Exits)
29360 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
29361 TII->get(TargetOpcode::COPY), *I)
29362 .addReg(NewVR);
29363 }
29364}
29365
29366bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
29367 // Integer division on AArch64 is expensive. However, when aggressively
29368 // optimizing for code size, we prefer to use a div instruction, as it is
29369 // usually smaller than the alternative sequence.
29370 // The exception to this is vector division. Since AArch64 doesn't have vector
29371 // integer division, leaving the division as-is is a loss even in terms of
29372 // size, because it will have to be scalarized, while the alternative code
29373 // sequence can be performed in vector form.
29374 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
29375 return OptSize && !VT.isVector();
29376}
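// Illustrative example (not from the source): when the function is marked
// minsize, a scalar division by a constant such as
//   %q = sdiv i32 %x, 1000
// is left as a single SDIV instruction instead of being expanded into the
// usual multiply-high/shift sequence, which is larger. A fixed-width vector
// divide has no AArch64 instruction and would be scalarized anyway, so the
// expansion remains preferable there.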
29377
29378bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
29379 const MachineFunction &MF) const {
29380 // Avoid merging stores into fixed-length vectors when Neon is unavailable.
29381 // In future, we could allow this when SVE is available, but currently,
29382 // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
29383 // the general lowering may introduce stack spills/reloads).
29384 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
29385 return false;
29386
29387 // Do not merge to float value size (128 bits) if no implicit float attribute
29388 // is set.
29389 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
29390 return !NoFloat || MemVT.getSizeInBits() <= 64;
29391}
29392
29393bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
29394 // We want inc-of-add for scalars and sub-of-not for vectors.
29395 return VT.isScalarInteger();
29396}
29397
29398bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
29399 EVT VT) const {
29400 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
29401 // legalize.
29402 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
29403 return false;
29404 if (FPVT == MVT::v8bf16)
29405 return false;
29406 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
29407}
29408
29409bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
29410 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
29411 // avoid vselect becoming bsl / unrolling.
29412 return !VT.isFixedLengthVector();
29413}
29414
29415MachineInstr *
29416AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
29417 MachineBasicBlock::instr_iterator &MBBI,
29418 const TargetInstrInfo *TII) const {
29419 assert(MBBI->isCall() && MBBI->getCFIType() &&
29420 "Invalid call instruction for a KCFI check");
29421
29422 switch (MBBI->getOpcode()) {
29423 case AArch64::BLR:
29424 case AArch64::BLRNoIP:
29425 case AArch64::TCRETURNri:
29426 case AArch64::TCRETURNrix16x17:
29427 case AArch64::TCRETURNrix17:
29428 case AArch64::TCRETURNrinotx16:
29429 break;
29430 default:
29431 llvm_unreachable("Unexpected CFI call opcode");
29432 }
29433
29434 MachineOperand &Target = MBBI->getOperand(0);
29435 assert(Target.isReg() && "Invalid target operand for an indirect call");
29436 Target.setIsRenamable(false);
29437
29438 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
29439 .addReg(Target.getReg())
29440 .addImm(MBBI->getCFIType())
29441 .getInstr();
29442}
29443
29444bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
29445 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
29446}
29447
29448unsigned
29449AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
29450 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
29451 return getPointerTy(DL).getSizeInBits();
29452
29453 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
29454}
29455
29456void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
29457 MachineFrameInfo &MFI = MF.getFrameInfo();
29458 // If we have any vulnerable SVE stack objects then the stack protector
29459 // needs to be placed at the top of the SVE stack area, as the SVE locals
29460 // are placed above the other locals, so we allocate it as if it were a
29461 // scalable vector.
29462 // FIXME: It may be worthwhile having a specific interface for this rather
29463 // than doing it here in finalizeLowering.
29464 if (MFI.hasStackProtectorIndex()) {
29465 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
29471 break;
29472 }
29473 }
29474 }
29477}
29478
29479// Unlike X86, we let frame lowering assign offsets to all catch objects.
29480bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; }
29481
29482bool AArch64TargetLowering::shouldLocalize(
29483 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
29484 auto &MF = *MI.getMF();
29485 auto &MRI = MF.getRegInfo();
29486 auto maxUses = [](unsigned RematCost) {
29487 // A cost of 1 means remats are basically free.
29488 if (RematCost == 1)
29489 return std::numeric_limits<unsigned>::max();
29490 if (RematCost == 2)
29491 return 2U;
29492
29493 // Remat is too expensive, only sink if there's one user.
29494 if (RematCost > 2)
29495 return 1U;
29496 llvm_unreachable("Unexpected remat cost");
29497 };
29498
29499 unsigned Opc = MI.getOpcode();
29500 switch (Opc) {
29501 case TargetOpcode::G_GLOBAL_VALUE: {
29502 // On Darwin, TLS global vars get selected into function calls, which
29503 // we don't want localized, as they can get moved into the middle of
29504 // another call sequence.
29505 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
29506 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
29507 return false;
29508 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
29509 }
29510 case TargetOpcode::G_FCONSTANT:
29511 case TargetOpcode::G_CONSTANT: {
29512 const ConstantInt *CI;
29513 unsigned AdditionalCost = 0;
29514
29515 if (Opc == TargetOpcode::G_CONSTANT)
29516 CI = MI.getOperand(1).getCImm();
29517 else {
29518 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
29519 // We try to estimate cost of 32/64b fpimms, as they'll likely be
29520 // materialized as integers.
29521 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
29522 break;
29523 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
29524 bool OptForSize = MF.getFunction().hasOptSize();
29526 OptForSize))
29527 return true; // Constant should be cheap.
29528 CI =
29529 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
29530 // FP materialization also costs an extra move, from gpr to fpr.
29531 AdditionalCost = 1;
29532 }
29533 APInt Imm = CI->getValue();
29536 assert(Cost.isValid() && "Expected a valid imm cost");
29537
29538 unsigned RematCost = Cost.getValue();
29539 RematCost += AdditionalCost;
29540 Register Reg = MI.getOperand(0).getReg();
29541 unsigned MaxUses = maxUses(RematCost);
29542 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
29543 if (MaxUses == std::numeric_limits<unsigned>::max())
29544 --MaxUses;
29545 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
29546 }
29547 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
29548 // localizable.
29549 case AArch64::ADRP:
29550 case AArch64::G_ADD_LOW:
29551 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
29552 case TargetOpcode::G_PTR_ADD:
29553 return true;
29554 default:
29555 break;
29556 }
29557 return TargetLoweringBase::shouldLocalize(MI, TTI);
29558}
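// Illustrative example (not from the source): a G_CONSTANT such as 0x12345678
// needs a MOV+MOVK pair, i.e. a remat cost of 2, so it is only localized into
// its using blocks when it has at most two user instructions; a constant that
// fits a single MOV (cost 1) is localized regardless of its number of uses.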
29559
29560bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
29561 // Fallback for scalable vectors.
29562 // Note that if EnableSVEGISel is true, we allow scalable vector types for
29563 // all instructions, regardless of whether they are actually supported.
29564 if (!EnableSVEGISel) {
29565 if (Inst.getType()->isScalableTy()) {
29566 return true;
29567 }
29568
29569 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
29570 if (Inst.getOperand(i)->getType()->isScalableTy())
29571 return true;
29572
29573 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
29574 if (AI->getAllocatedType()->isScalableTy())
29575 return true;
29576 }
29577 }
29578
29579 // Checks to allow the use of SME instructions
29580 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
29581 auto CallAttrs = SMECallAttrs(*Base, this);
29582 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
29583 CallAttrs.requiresPreservingZT0() ||
29584 CallAttrs.requiresPreservingAllZAState())
29585 return true;
29586 }
29587 return false;
29588}
29589
29590// Return the largest legal scalable vector type that matches VT's element type.
29591static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
29592 assert(VT.isFixedLengthVector() &&
29593 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
29594 "Expected legal fixed length vector!");
29595 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29596 default:
29597 llvm_unreachable("unexpected element type for SVE container");
29598 case MVT::i8:
29599 return EVT(MVT::nxv16i8);
29600 case MVT::i16:
29601 return EVT(MVT::nxv8i16);
29602 case MVT::i32:
29603 return EVT(MVT::nxv4i32);
29604 case MVT::i64:
29605 return EVT(MVT::nxv2i64);
29606 case MVT::bf16:
29607 return EVT(MVT::nxv8bf16);
29608 case MVT::f16:
29609 return EVT(MVT::nxv8f16);
29610 case MVT::f32:
29611 return EVT(MVT::nxv4f32);
29612 case MVT::f64:
29613 return EVT(MVT::nxv2f64);
29614 }
29615}
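// Illustrative mapping (not from the source):
//   v8i32  -> nxv4i32
//   v16f16 -> nxv8f16
//   v32i8  -> nxv16i8
// i.e. the container is always a full packed SVE register whose element type
// matches that of the fixed-length vector, regardless of the fixed width.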
29616
29617// Return a predicate with active lanes corresponding to the extent of VT.
29618static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
29619 EVT VT) {
29620 assert(VT.isFixedLengthVector() &&
29621 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
29622 "Expected legal fixed length vector!");
29623
29624 std::optional<unsigned> PgPattern =
29626 assert(PgPattern && "Unexpected element count for SVE predicate");
29627
29628 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
29629 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
29630 // variants of instructions when available.
29631 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29632 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
29633 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
29634 if (MaxSVESize && MinSVESize == MaxSVESize &&
29635 MaxSVESize == VT.getSizeInBits())
29636 PgPattern = AArch64SVEPredPattern::all;
29637
29638 MVT MaskVT;
29639 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
29640 default:
29641 llvm_unreachable("unexpected element type for SVE predicate");
29642 case MVT::i8:
29643 MaskVT = MVT::nxv16i1;
29644 break;
29645 case MVT::i16:
29646 case MVT::f16:
29647 case MVT::bf16:
29648 MaskVT = MVT::nxv8i1;
29649 break;
29650 case MVT::i32:
29651 case MVT::f32:
29652 MaskVT = MVT::nxv4i1;
29653 break;
29654 case MVT::i64:
29655 case MVT::f64:
29656 MaskVT = MVT::nxv2i1;
29657 break;
29658 }
29659
29660 return getPTrue(DAG, DL, MaskVT, *PgPattern);
29661}
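// Illustrative example (not from the source): a v8i32 operand gets the
// predicate "ptrue p0.s, vl8", limiting SVE operations to the eight lanes the
// fixed-length type occupies; when the minimum and maximum SVE vector widths
// are both configured to exactly 256 bits the pattern degrades to "all",
// allowing unpredicated instruction forms to be picked later.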
29662
29663static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
29664 EVT VT) {
29665 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
29666 "Expected legal scalable vector!");
29667 auto PredTy = VT.changeVectorElementType(MVT::i1);
29668 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
29669}
29670
29671static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
29672 if (VT.isFixedLengthVector())
29673 return getPredicateForFixedLengthVector(DAG, DL, VT);
29674
29675 return getPredicateForScalableVector(DAG, DL, VT);
29676}
29677
29678// Grow V to consume an entire SVE register.
29679static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
29680 assert(VT.isScalableVector() &&
29681 "Expected to convert into a scalable vector!");
29682 assert(V.getValueType().isFixedLengthVector() &&
29683 "Expected a fixed length vector operand!");
29684 SDLoc DL(V);
29685 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29686 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
29687}
29688
29689// Shrink V so it's just big enough to maintain a VT's worth of data.
29690static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
29691 assert(VT.isFixedLengthVector() &&
29692 "Expected to convert into a fixed length vector!");
29693 assert(V.getValueType().isScalableVector() &&
29694 "Expected a scalable vector operand!");
29695 SDLoc DL(V);
29696 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
29697 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
29698}
29699
29700// Convert all fixed length vector loads larger than NEON to masked_loads.
29701SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
29702 SDValue Op, SelectionDAG &DAG) const {
29703 auto Load = cast<LoadSDNode>(Op);
29704
29705 SDLoc DL(Op);
29706 EVT VT = Op.getValueType();
29707 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29708 EVT LoadVT = ContainerVT;
29709 EVT MemVT = Load->getMemoryVT();
29710
29711 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29712
29713 if (VT.isFloatingPoint()) {
29714 LoadVT = ContainerVT.changeTypeToInteger();
29715 MemVT = MemVT.changeTypeToInteger();
29716 }
29717
29718 SDValue NewLoad = DAG.getMaskedLoad(
29719 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
29720 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
29721 Load->getAddressingMode(), Load->getExtensionType());
29722
29723 SDValue Result = NewLoad;
29724 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
29725 EVT ExtendVT = ContainerVT.changeVectorElementType(
29726 Load->getMemoryVT().getVectorElementType());
29727
29728 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
29729 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
29730 Pg, Result, DAG.getUNDEF(ContainerVT));
29731 } else if (VT.isFloatingPoint()) {
29732 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
29733 }
29734
29735 Result = convertFromScalableVector(DAG, VT, Result);
29736 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29737 return DAG.getMergeValues(MergedValues, DL);
29738}
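// Illustrative DAG sketch (not from the source): a plain fixed-length load
//   t1: v8i32,ch = load<(load 32)> t0, %ptr
// becomes a predicated load of the containing type followed by a shrink:
//   t1: nxv4i32,ch = masked_load t0, %ptr, undef, ptrue(vl8), undef
//   t2: v8i32 = extract_subvector t1, 0
// so only the lanes covered by the fixed-length type are accessed.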
29739
29741 SelectionDAG &DAG) {
29742 SDLoc DL(Mask);
29743 EVT InVT = Mask.getValueType();
29744 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
29746
29747 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
29748 return Pg;
29749
29750 bool InvertCond = false;
29751 if (isBitwiseNot(Mask)) {
29752 InvertCond = true;
29753 Mask = Mask.getOperand(0);
29754 }
29755
29756 SDValue Op1, Op2;
29757 ISD::CondCode CC;
29758
29759 // When Mask is the result of a SETCC, it's better to regenerate the compare.
29760 if (Mask.getOpcode() == ISD::SETCC) {
29761 Op1 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(0));
29762 Op2 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(1));
29763 CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
29764 } else {
29765 Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
29766 Op2 = DAG.getConstant(0, DL, ContainerVT);
29767 CC = ISD::SETNE;
29768 }
29769
29770 if (InvertCond)
29771 CC = getSetCCInverse(CC, Op1.getValueType());
29772
29773 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
29774 {Pg, Op1, Op2, DAG.getCondCode(CC)});
29775}
29776
29777// Convert all fixed length vector masked loads larger than NEON to SVE masked loads.
29778SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
29779 SDValue Op, SelectionDAG &DAG) const {
29781
29782 SDLoc DL(Op);
29783 EVT VT = Op.getValueType();
29784 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29785
29786 SDValue Mask = Load->getMask();
29787 // If this is an extending load and the mask type is not the same as the
29788 // load's type then we have to extend the mask type.
29789 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
29790 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
29791 "Incorrect mask type");
29792 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
29793 }
29795
29796 SDValue PassThru;
29797 bool IsPassThruZeroOrUndef = false;
29798
29799 if (Load->getPassThru()->isUndef()) {
29800 PassThru = DAG.getUNDEF(ContainerVT);
29801 IsPassThruZeroOrUndef = true;
29802 } else {
29803 if (ContainerVT.isInteger())
29804 PassThru = DAG.getConstant(0, DL, ContainerVT);
29805 else
29806 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
29807 if (isZerosVector(Load->getPassThru().getNode()))
29808 IsPassThruZeroOrUndef = true;
29809 }
29810
29811 SDValue NewLoad = DAG.getMaskedLoad(
29812 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
29813 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
29814 Load->getAddressingMode(), Load->getExtensionType());
29815
29816 SDValue Result = NewLoad;
29817 if (!IsPassThruZeroOrUndef) {
29818 SDValue OldPassThru =
29819 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
29820 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
29821 }
29822
29823 Result = convertFromScalableVector(DAG, VT, Result);
29824 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
29825 return DAG.getMergeValues(MergedValues, DL);
29826}
29827
29828// Convert all fixed length vector stores larger than NEON to masked_stores.
29829SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
29830 SDValue Op, SelectionDAG &DAG) const {
29831 auto Store = cast<StoreSDNode>(Op);
29832
29833 SDLoc DL(Op);
29834 EVT VT = Store->getValue().getValueType();
29835 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29836 EVT MemVT = Store->getMemoryVT();
29837
29838 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29839 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
29840
29841 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
29842 EVT TruncVT = ContainerVT.changeVectorElementType(
29843 Store->getMemoryVT().getVectorElementType());
29844 MemVT = MemVT.changeTypeToInteger();
29845 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
29846 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
29847 DAG.getUNDEF(TruncVT));
29848 NewValue =
29849 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
29850 } else if (VT.isFloatingPoint()) {
29851 MemVT = MemVT.changeTypeToInteger();
29852 NewValue =
29853 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
29854 }
29855
29856 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
29857 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
29858 Store->getMemOperand(), Store->getAddressingMode(),
29859 Store->isTruncatingStore());
29860}
29861
29862SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
29863 SDValue Op, SelectionDAG &DAG) const {
29865
29866 SDLoc DL(Op);
29867 EVT VT = Store->getValue().getValueType();
29868 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29869
29870 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
29872
29873 return DAG.getMaskedStore(
29874 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
29875 Mask, Store->getMemoryVT(), Store->getMemOperand(),
29876 Store->getAddressingMode(), Store->isTruncatingStore());
29877}
29878
29879SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
29880 SDValue Op, SelectionDAG &DAG) const {
29881 SDLoc DL(Op);
29882 EVT VT = Op.getValueType();
29883 EVT EltVT = VT.getVectorElementType();
29884
29885 bool Signed = Op.getOpcode() == ISD::SDIV;
29886 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
29887
29888 bool Negated;
29889 uint64_t SplatVal;
29890 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
29891 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29892 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
29893 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
29894
29896 SDValue Res =
29897 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
29898 if (Negated)
29899 Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
29900 DAG.getConstant(0, DL, ContainerVT), Res);
29901
29902 return convertFromScalableVector(DAG, VT, Res);
29903 }
29904
29905 // Scalable vector i32/i64 DIV is supported.
29906 if (EltVT == MVT::i32 || EltVT == MVT::i64)
29907 return LowerToPredicatedOp(Op, DAG, PredOpcode);
29908
29909 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
29910 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
29911 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
29912 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29913
29914 // If the wider type is legal: extend, op, and truncate.
29915 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
29916 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
29917 SDValue Op0 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(0));
29918 SDValue Op1 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(1));
29919 SDValue Div = DAG.getNode(Op.getOpcode(), DL, WideVT, Op0, Op1);
29920 return DAG.getNode(ISD::TRUNCATE, DL, VT, Div);
29921 }
29922
29923 auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
29924 &ExtendOpcode](SDValue Op) {
29925 SDValue IdxZero = DAG.getConstant(0, DL, MVT::i64);
29926 SDValue IdxHalf =
29927 DAG.getConstant(HalfVT.getVectorNumElements(), DL, MVT::i64);
29928 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxZero);
29929 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxHalf);
29930 return std::pair<SDValue, SDValue>(
29931 {DAG.getNode(ExtendOpcode, DL, PromVT, Lo),
29932 DAG.getNode(ExtendOpcode, DL, PromVT, Hi)});
29933 };
29934
29935 // If wider type is not legal: split, extend, op, trunc and concat.
29936 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
29937 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
29938 SDValue Lo = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0LoExt, Op1LoExt);
29939 SDValue Hi = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0HiExt, Op1HiExt);
29940 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Lo);
29941 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Hi);
29942 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoTrunc, HiTrunc});
29943}
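// Illustrative example (not from the source): SVE only provides i32/i64
// element divides, so a fixed-length v32i8 sdiv is either widened to v32i16
// when that type is legal for the configured vector length, or split into two
// halves that are extended, divided and truncated before being concatenated
// back together; the recursion bottoms out once the elements reach i32.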
29944
29945SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
29946 SDValue Op, SelectionDAG &DAG) const {
29947 EVT VT = Op.getValueType();
29948 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29949
29950 SDLoc DL(Op);
29951 SDValue Val = Op.getOperand(0);
29952 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
29953 Val = convertToScalableVector(DAG, ContainerVT, Val);
29954
29955 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
29956 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
29957
29958 // Repeatedly unpack Val until the result is of the desired element type.
29959 switch (ContainerVT.getSimpleVT().SimpleTy) {
29960 default:
29961 llvm_unreachable("unimplemented container type");
29962 case MVT::nxv16i8:
29963 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
29964 if (VT.getVectorElementType() == MVT::i16)
29965 break;
29966 [[fallthrough]];
29967 case MVT::nxv8i16:
29968 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
29969 if (VT.getVectorElementType() == MVT::i32)
29970 break;
29971 [[fallthrough]];
29972 case MVT::nxv4i32:
29973 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
29974 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
29975 break;
29976 }
29977
29978 return convertFromScalableVector(DAG, VT, Val);
29979}
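// Illustrative sketch (not from the source): extending i8 data all the way to
// i64 elements unpacks three times,
//   nxv16i8 -> nxv8i16 -> nxv4i32 -> nxv2i64,
// using UUNPKLO (or SUNPKLO for sign extension) at each step; the low half is
// kept each time, which is where the fixed-length payload sits after
// convertToScalableVector.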
29980
29981SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
29982 SDValue Op, SelectionDAG &DAG) const {
29983 EVT VT = Op.getValueType();
29984 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29985
29986 SDLoc DL(Op);
29987 SDValue Val = Op.getOperand(0);
29988 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
29989 Val = convertToScalableVector(DAG, ContainerVT, Val);
29990
29991 // Repeatedly truncate Val until the result is of the desired element type.
29992 switch (ContainerVT.getSimpleVT().SimpleTy) {
29993 default:
29994 llvm_unreachable("unimplemented container type");
29995 case MVT::nxv2i64:
29996 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
29997 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
29998 if (VT.getVectorElementType() == MVT::i32)
29999 break;
30000 [[fallthrough]];
30001 case MVT::nxv4i32:
30002 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
30003 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
30004 if (VT.getVectorElementType() == MVT::i16)
30005 break;
30006 [[fallthrough]];
30007 case MVT::nxv8i16:
30008 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
30009 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
30010 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
30011 break;
30012 }
30013
30014 return convertFromScalableVector(DAG, VT, Val);
30015}
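// Illustrative sketch (not from the source): the narrowing direction uses
// UZP1 with the value as both operands, e.g. truncating i64 elements to i8:
//   nxv2i64 -(bitcast)-> nxv4i32 -(uzp1)-> nxv8i16 -(uzp1)-> nxv16i8
// where each UZP1 keeps the even-numbered sub-elements, i.e. the low half of
// every wide lane.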
30016
30017SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
30018 SDValue Op, SelectionDAG &DAG) const {
30019 EVT VT = Op.getValueType();
30020 EVT InVT = Op.getOperand(0).getValueType();
30021 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
30022
30023 SDLoc DL(Op);
30024 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30025 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
30026
30027 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
30028}
30029
30030SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
30031 SDValue Op, SelectionDAG &DAG) const {
30032 EVT VT = Op.getValueType();
30033 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30034
30035 SDLoc DL(Op);
30036 EVT InVT = Op.getOperand(0).getValueType();
30037 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30038 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
30039
30040 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
30041 Op.getOperand(1), Op.getOperand(2));
30042
30043 return convertFromScalableVector(DAG, VT, ScalableRes);
30044}
30045
30046// Convert vector operation 'Op' to an equivalent predicated operation whereby
30047// the original operation's type is used to construct a suitable predicate.
30048// NOTE: The results for inactive lanes are undefined.
30049SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
30050 SelectionDAG &DAG,
30051 unsigned NewOp) const {
30052 EVT VT = Op.getValueType();
30053 SDLoc DL(Op);
30054 auto Pg = getPredicateForVector(DAG, DL, VT);
30055
30056 if (VT.isFixedLengthVector()) {
30057 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
30058 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30059
30060 // Create list of operands by converting existing ones to scalable types.
30062 for (const SDValue &V : Op->op_values()) {
30063 if (isa<CondCodeSDNode>(V)) {
30064 Operands.push_back(V);
30065 continue;
30066 }
30067
30068 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
30069 EVT VTArg = VTNode->getVT().getVectorElementType();
30070 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
30071 Operands.push_back(DAG.getValueType(NewVTArg));
30072 continue;
30073 }
30074
30075 assert(isTypeLegal(V.getValueType()) &&
30076 "Expected only legal fixed-width types");
30077 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
30078 }
30079
30080 if (isMergePassthruOpcode(NewOp))
30081 Operands.push_back(DAG.getUNDEF(ContainerVT));
30082
30083 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
30084 return convertFromScalableVector(DAG, VT, ScalableRes);
30085 }
30086
30087 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
30088
30090 for (const SDValue &V : Op->op_values()) {
30091 assert((!V.getValueType().isVector() ||
30092 V.getValueType().isScalableVector()) &&
30093 "Only scalable vectors are supported!");
30094 Operands.push_back(V);
30095 }
30096
30097 if (isMergePassthruOpcode(NewOp))
30098 Operands.push_back(DAG.getUNDEF(VT));
30099
30100 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
30101}
30102
30103// If a fixed length vector operation has no side effects when applied to
30104// undefined elements, we can safely use scalable vectors to perform the same
30105// operation without needing to worry about predication.
30106SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
30107 SelectionDAG &DAG) const {
30108 EVT VT = Op.getValueType();
30110 "Only expected to lower fixed length vector operation!");
30111 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30112
30113 // Create list of operands by converting existing ones to scalable types.
30115 for (const SDValue &V : Op->op_values()) {
30116 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
30117
30118 // Pass through non-vector operands.
30119 if (!V.getValueType().isVector()) {
30120 Ops.push_back(V);
30121 continue;
30122 }
30123
30124 // "cast" fixed length vector to a scalable vector.
30125 assert(V.getValueType().isFixedLengthVector() &&
30126 isTypeLegal(V.getValueType()) &&
30127 "Only fixed length vectors are supported!");
30128 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
30129 }
30130
30131 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
30132 return convertFromScalableVector(DAG, VT, ScalableRes);
30133}
30134
30135SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
30136 SelectionDAG &DAG) const {
30137 SDLoc DL(ScalarOp);
30138 SDValue AccOp = ScalarOp.getOperand(0);
30139 SDValue VecOp = ScalarOp.getOperand(1);
30140 EVT SrcVT = VecOp.getValueType();
30141 EVT ResVT = SrcVT.getVectorElementType();
30142
30143 EVT ContainerVT = SrcVT;
30144 if (SrcVT.isFixedLengthVector()) {
30145 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
30146 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
30147 }
30148
30149 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
30150 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30151
30152 // Convert operands to Scalable.
30153 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
30154 DAG.getUNDEF(ContainerVT), AccOp, Zero);
30155
30156 // Perform reduction.
30157 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
30158 Pg, AccOp, VecOp);
30159
30160 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
30161}
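// Illustrative example (not from the source): an ordered reduction such as
//   %r = call float @llvm.vector.reduce.fadd.v8f32(float %acc, <8 x float> %v)
// becomes "fadda s0, p0, s0, z1.s" under a vl8 predicate, since FADDA
// accumulates strictly in lane order and so preserves the required ordering.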
30162
30163SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
30164 SelectionDAG &DAG) const {
30165 SDLoc DL(ReduceOp);
30166 SDValue Op = ReduceOp.getOperand(0);
30167 EVT OpVT = Op.getValueType();
30168 EVT VT = ReduceOp.getValueType();
30169
30170 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
30171 return SDValue();
30172
30173 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
30174
30175 switch (ReduceOp.getOpcode()) {
30176 default:
30177 return SDValue();
30178 case ISD::VECREDUCE_OR:
30179 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
30180 // The predicate can be 'Op' because
30181 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
30182 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
30183 else
30184 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
30185 case ISD::VECREDUCE_AND: {
30186 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
30187 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
30188 }
30189 case ISD::VECREDUCE_XOR: {
30190 SDValue ID =
30191 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
30192 if (OpVT == MVT::nxv1i1) {
30193 // Emulate a CNTP on .Q using .D and a different governing predicate.
30194 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
30195 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
30196 }
30197 SDValue Cntp =
30198 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
30199 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
30200 }
30201 }
30202
30203 return SDValue();
30204}
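// Illustrative example (not from the source): VECREDUCE_OR of an nxv16i1
// predicate becomes a PTEST plus a conditional set on the "any active"
// condition, while VECREDUCE_XOR is computed as a CNTP of the active lanes
// whose low bit gives the parity.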
30205
30206SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
30207 SDValue ScalarOp,
30208 SelectionDAG &DAG) const {
30209 SDLoc DL(ScalarOp);
30210 SDValue VecOp = ScalarOp.getOperand(0);
30211 EVT SrcVT = VecOp.getValueType();
30212
30214 SrcVT,
30215 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
30216 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
30217 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
30218 }
30219
30220 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
30221 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
30222 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
30223 SDValue BoolVec = VecOp.getOperand(0);
30224 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
30225 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
30226 SDValue CntpOp = DAG.getNode(
30227 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
30228 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
30229 BoolVec, BoolVec);
30230 return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
30231 }
30232 }
30233
30234 // UADDV always returns an i64 result.
30235 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
30236 SrcVT.getVectorElementType();
30237 EVT RdxVT = SrcVT;
30238 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
30239 RdxVT = getPackedSVEVectorVT(ResVT);
30240
30241 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
30242 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
30243 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
30244 Rdx, DAG.getConstant(0, DL, MVT::i64));
30245
30246 // The VEC_REDUCE nodes expect an element size result.
30247 if (ResVT != ScalarOp.getValueType())
30248 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
30249
30250 return Res;
30251}
30252
30253SDValue
30254AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
30255 SelectionDAG &DAG) const {
30256 EVT VT = Op.getValueType();
30257 SDLoc DL(Op);
30258
30259 EVT InVT = Op.getOperand(1).getValueType();
30260 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30261 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
30262 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
30263
30264 // Convert the mask to a predicate (NOTE: We don't need to worry about
30265 // inactive lanes since VSELECT is safe when given undefined elements).
30266 EVT MaskVT = Op.getOperand(0).getValueType();
30267 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
30268 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
30270 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
30271
30272 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
30273 Mask, Op1, Op2);
30274
30275 return convertFromScalableVector(DAG, VT, ScalableRes);
30276}
30277
30278SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
30279 SDValue Op, SelectionDAG &DAG) const {
30280 SDLoc DL(Op);
30281 EVT InVT = Op.getOperand(0).getValueType();
30282 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30283
30284 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
30285 "Only expected to lower fixed length vector operation!");
30286 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
30287 "Expected integer result of the same bit length as the inputs!");
30288
30289 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
30290 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
30291 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
30292
30293 EVT CmpVT = Pg.getValueType();
30294 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
30295 {Pg, Op1, Op2, Op.getOperand(2)});
30296
30297 EVT PromoteVT = ContainerVT.changeTypeToInteger();
30298 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
30299 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
30300}
30301
30302SDValue
30303AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
30304 SelectionDAG &DAG) const {
30305 SDLoc DL(Op);
30306 auto SrcOp = Op.getOperand(0);
30307 EVT VT = Op.getValueType();
30308 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30309 EVT ContainerSrcVT =
30311
30312 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
30313 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
30314 return convertFromScalableVector(DAG, VT, Op);
30315}
30316
30317SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
30318 SDValue Op, SelectionDAG &DAG) const {
30319 SDLoc DL(Op);
30320 unsigned NumOperands = Op->getNumOperands();
30321
30322 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
30323 "Unexpected number of operands in CONCAT_VECTORS");
30324
30325 auto SrcOp1 = Op.getOperand(0);
30326 auto SrcOp2 = Op.getOperand(1);
30327 EVT VT = Op.getValueType();
30328 EVT SrcVT = SrcOp1.getValueType();
30329
30330 // Match a splat of 128b segments that fit in a single register.
30331 if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
30332 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30333 SDValue Splat =
30334 DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
30335 convertToScalableVector(DAG, ContainerVT, SrcOp1),
30336 DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
30337 return convertFromScalableVector(DAG, VT, Splat);
30338 }
30339
30340 if (NumOperands > 2) {
30342 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
30343 for (unsigned I = 0; I < NumOperands; I += 2)
30344 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
30345 Op->getOperand(I), Op->getOperand(I + 1)));
30346
30347 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
30348 }
30349
30350 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30351
30353 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
30354 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
30355
30356 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
30357
30358 return convertFromScalableVector(DAG, VT, Op);
30359}
30360
30361SDValue
30362AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
30363 SelectionDAG &DAG) const {
30364 EVT VT = Op.getValueType();
30365 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30366
30367 SDLoc DL(Op);
30368 SDValue Val = Op.getOperand(0);
30369 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30370 EVT SrcVT = Val.getValueType();
30371 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30372 EVT ExtendVT = ContainerVT.changeVectorElementType(
30373 SrcVT.getVectorElementType());
30374
30375 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
30376 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
30377
30378 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
30379 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
30380 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
30381 Pg, Val, DAG.getUNDEF(ContainerVT));
30382
30383 return convertFromScalableVector(DAG, VT, Val);
30384}
30385
30386SDValue
30387AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
30388 SelectionDAG &DAG) const {
30389 EVT VT = Op.getValueType();
30390 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30391
30392 SDLoc DL(Op);
30393 SDValue Val = Op.getOperand(0);
30394 EVT SrcVT = Val.getValueType();
30395 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30396 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
30398 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
30399
30400 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30401 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
30402 Op.getOperand(1), DAG.getUNDEF(RoundVT));
30403 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
30404 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
30405
30406 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
30407 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
30408}
30409
30410SDValue
30411AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
30412 SelectionDAG &DAG) const {
30413 EVT VT = Op.getValueType();
30414 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30415
30416 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
30417 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
30418 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
30419
30420 SDLoc DL(Op);
30421 SDValue Val = Op.getOperand(0);
30422 EVT SrcVT = Val.getValueType();
30423 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30424 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30425
30426 if (VT.bitsGE(SrcVT)) {
30428
30429 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
30430 VT.changeTypeToInteger(), Val);
30431
30432 // It is safe to use a wider operand than specified because promoting the
30433 // value changes nothing from an arithmetic point of view.
30434 Val =
30435 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
30436 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
30437 DAG.getUNDEF(ContainerDstVT));
30438 return convertFromScalableVector(DAG, VT, Val);
30439 } else {
30440 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
30441 ContainerDstVT.getVectorElementType());
30443
30444 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30445 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
30446 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
30447 Val = convertFromScalableVector(DAG, SrcVT, Val);
30448
30449 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
30450 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
30451 }
30452}
30453
30454SDValue
30455AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
30456 SelectionDAG &DAG) const {
30457 SDLoc DL(Op);
30458 EVT OpVT = Op.getValueType();
30459 assert(OpVT.isScalableVector() &&
30460 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
30461
30462 // Are multi-register uzp instructions available?
30463 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
30464 OpVT.getVectorElementType() != MVT::i1) {
30465 Intrinsic::ID IntID;
30466 switch (Op->getNumOperands()) {
30467 default:
30468 return SDValue();
30469 case 2:
30470 IntID = Intrinsic::aarch64_sve_uzp_x2;
30471 break;
30472 case 4:
30473 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
30474 OpVT.getScalarSizeInBits() == 64)
30475 return SDValue();
30476 IntID = Intrinsic::aarch64_sve_uzp_x4;
30477 break;
30478 }
30479
30481 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30482 Ops.append(Op->op_values().begin(), Op->op_values().end());
30483 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
30484 }
30485
30486 if (Op->getNumOperands() != 2)
30487 return SDValue();
30488
30489 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
30490 Op.getOperand(1));
30491 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
30492 Op.getOperand(1));
30493 return DAG.getMergeValues({Even, Odd}, DL);
30494}
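// Illustrative example (not from the source): deinterleaving two nxv4i32
// inputs holding {a0,b0,a1,b1,...} is simply
//   even = uzp1 zE.s, z0.s, z1.s
//   odd  = uzp2 zO.s, z0.s, z1.s
// while the SME2 streaming path uses the multi-vector UZP intrinsics for the
// two- and four-register forms.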
30495
30496SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
30497 SelectionDAG &DAG) const {
30498 SDLoc DL(Op);
30499 EVT OpVT = Op.getValueType();
30500 assert(OpVT.isScalableVector() &&
30501 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
30502
30503 // Are multi-register zip instructions available?
30504 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
30505 OpVT.getVectorElementType() != MVT::i1) {
30506 Intrinsic::ID IntID;
30507 switch (Op->getNumOperands()) {
30508 default:
30509 return SDValue();
30510 case 2:
30511 IntID = Intrinsic::aarch64_sve_zip_x2;
30512 break;
30513 case 4:
30514 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
30515 OpVT.getScalarSizeInBits() == 64)
30516 return SDValue();
30517 IntID = Intrinsic::aarch64_sve_zip_x4;
30518 break;
30519 }
30520
30522 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
30523 Ops.append(Op->op_values().begin(), Op->op_values().end());
30524 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
30525 }
30526
30527 if (Op->getNumOperands() != 2)
30528 return SDValue();
30529
30530 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
30531 Op.getOperand(1));
30532 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
30533 Op.getOperand(1));
30534 return DAG.getMergeValues({Lo, Hi}, DL);
30535}
30536
30537SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
30538 SelectionDAG &DAG) const {
30539 // FIXME: Maybe share some code with LowerMGather/Scatter?
30540 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
30541 SDLoc DL(HG);
30542 SDValue Chain = HG->getChain();
30543 SDValue Inc = HG->getInc();
30544 SDValue Mask = HG->getMask();
30545 SDValue Ptr = HG->getBasePtr();
30546 SDValue Index = HG->getIndex();
30547 SDValue Scale = HG->getScale();
30548 SDValue IntID = HG->getIntID();
30549
30550 // The Intrinsic ID determines the type of update operation.
30551 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
30552 // Right now, we only support 'add' as an update.
30553 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
30554 "Unexpected histogram update operation");
30555
30556 EVT IndexVT = Index.getValueType();
30557 LLVMContext &Ctx = *DAG.getContext();
30558 ElementCount EC = IndexVT.getVectorElementCount();
30559 EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
30560 EVT IncExtVT =
30561 EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
30562 EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
30563 bool ExtTrunc = IncSplatVT != MemVT;
30564
30565 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30566 SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
30567 SDValue IncSplat = DAG.getSplatVector(
30568 IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
30569 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
30570
30571 MachineMemOperand *MMO = HG->getMemOperand();
30572 // Create an MMO for the gather, without load|store flags.
30573 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
30575 MMO->getAlign(), MMO->getAAInfo());
30576 ISD::MemIndexType IndexType = HG->getIndexType();
30577 SDValue Gather = DAG.getMaskedGather(
30578 DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
30579 ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
30580
30581 SDValue GChain = Gather.getValue(1);
30582
30583 // Perform the histcnt, multiply by inc, add to bucket data.
30584 SDValue ID =
30585 DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
30586 SDValue HistCnt =
30587 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
30588 SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
30589 SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
30590
30591 // Create an MMO for the scatter, without load|store flags.
30592 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
30594 MMO->getAlign(), MMO->getAAInfo());
30595
30596 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
30597 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
30598 ScatterOps, SMMO, IndexType, ExtTrunc);
30599 return Scatter;
30600}
30601
30602/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
30603/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
30604/// however still make use of the dot product instruction by instead
30605/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
30606/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
30607/// the following pattern is emitted:
30608/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0)), ext(EXTRACT_SUBVECTOR(N,
30609/// NTy/2))))
30610SDValue
30611AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
30612 SelectionDAG &DAG) const {
30613 SDLoc DL(Op);
30614
30615 SDValue Acc = Op.getOperand(0);
30616 SDValue LHS = Op.getOperand(1);
30617 SDValue RHS = Op.getOperand(2);
30618 EVT ResultVT = Op.getValueType();
30619 EVT OrigResultVT = ResultVT;
30620 EVT OpVT = LHS.getValueType();
30621
30622 bool ConvertToScalable =
30623 ResultVT.isFixedLengthVector() &&
30624 useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
30625
30626 if (ConvertToScalable) {
30627 ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
30628 OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
30629 Acc = convertToScalableVector(DAG, ResultVT, Acc);
30630 LHS = convertToScalableVector(DAG, OpVT, LHS);
30631 RHS = convertToScalableVector(DAG, OpVT, RHS);
30632 Op = DAG.getNode(Op.getOpcode(), DL, ResultVT, {Acc, LHS, RHS});
30633 }
30634
30635 // Two-way and four-way partial reductions are supported by patterns.
30636 // We only need to handle the 8-way partial reduction.
30637 if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
30638 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Op)
30639 : Op;
30640
30641 EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
30642 SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, DotVT,
30643 DAG.getConstant(0, DL, DotVT), LHS, RHS);
30644
30645 SDValue Res;
30646 bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
30647 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
30648 unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
30649 unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
30650 SDValue Lo = DAG.getNode(LoOpcode, DL, ResultVT, Acc, DotNode);
30651 Res = DAG.getNode(HiOpcode, DL, ResultVT, Lo, DotNode);
30652 } else {
30653 // Fold (nx)v4i32 into (nx)v2i64
30654 auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(DotNode, DL);
30655 if (IsUnsigned) {
30656 DotNodeLo = DAG.getZExtOrTrunc(DotNodeLo, DL, ResultVT);
30657 DotNodeHi = DAG.getZExtOrTrunc(DotNodeHi, DL, ResultVT);
30658 } else {
30659 DotNodeLo = DAG.getSExtOrTrunc(DotNodeLo, DL, ResultVT);
30660 DotNodeHi = DAG.getSExtOrTrunc(DotNodeHi, DL, ResultVT);
30661 }
30662 auto Lo = DAG.getNode(ISD::ADD, DL, ResultVT, Acc, DotNodeLo);
30663 Res = DAG.getNode(ISD::ADD, DL, ResultVT, Lo, DotNodeHi);
30664 }
30665
30666 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Res)
30667 : Res;
30668}
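// Illustrative sketch (not from the source): for an nxv2i64 accumulator fed
// by nxv16i8 inputs, the dot product first accumulates into a zeroed nxv4i32,
//   d = udot(zeroinitializer, LHS, RHS)
// and those i32 partial sums are then folded into the i64 accumulator either
// with UADDWB/UADDWT (when SVE2 is available) or by splitting, extending and
// adding.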
30669
30670SDValue
30671AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
30672 SelectionDAG &DAG) const {
30673 EVT VT = Op.getValueType();
30674 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30675
30676 assert(Subtarget->isSVEorStreamingSVEAvailable() &&
30677 "Lowering fixed length get_active_lane_mask requires SVE!");
30678
30679 // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
30680 // but we can use SVE when available.
30681
30682 SDLoc DL(Op);
30683 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30684 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
30685
30686 SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WhileVT,
30687 Op.getOperand(0), Op.getOperand(1));
30688 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
30689 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
30690 DAG.getVectorIdxConstant(0, DL));
30691}
30692
30693SDValue
30694AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
30695 SelectionDAG &DAG) const {
30696 EVT VT = Op.getValueType();
30697 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30698
30699 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
30700 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
30701 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
30702
30703 SDLoc DL(Op);
30704 SDValue Val = Op.getOperand(0);
30705 EVT SrcVT = Val.getValueType();
30706 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
30707 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
30708
30709 if (VT.bitsGT(SrcVT)) {
30710 EVT CvtVT = ContainerDstVT.changeVectorElementType(
30711 ContainerSrcVT.getVectorElementType());
30713
30714 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
30715 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
30716
30717 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
30718 Val = getSVESafeBitCast(CvtVT, Val, DAG);
30719 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
30720 DAG.getUNDEF(ContainerDstVT));
30721 return convertFromScalableVector(DAG, VT, Val);
30722 } else {
30723 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
30725
30726 // It is safe to use a wider result than specified since an fp_to_int whose
30727 // result doesn't fit into the destination is undefined.
30728 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
30729 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
30730 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
30731
30732 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
30733 }
30734}
30735
30737 ArrayRef<int> ShuffleMask, EVT VT,
30738 EVT ContainerVT, SelectionDAG &DAG) {
30739 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
30740 SDLoc DL(Op);
30741 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
30742 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
30743 bool IsSingleOp =
30744 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
30745
30746 if (!Subtarget.isNeonAvailable() && !MinSVESize)
30747 MinSVESize = 128;
30748
30749 // Bail out on two-operand shuffles if SVE2 is unavailable or if not all
30750 // index values can be represented.
30751 if (!IsSingleOp && !Subtarget.hasSVE2())
30752 return SDValue();
30753
30754 EVT VTOp1 = Op.getOperand(0).getValueType();
30755 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
30756 unsigned IndexLen = MinSVESize / BitsPerElt;
30757 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
30758 uint64_t MaxOffset = maxUIntN(BitsPerElt);
30759 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
30760 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
30761 bool MinMaxEqual = (MinSVESize == MaxSVESize);
30762 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
30763 "Incorrectly legalised shuffle operation");
30764
30766 // If MinSVESize is not equal to MaxSVESize then we need to know which
30767 // TBL mask element needs adjustment.
30768 SmallVector<SDValue, 8> AddRuntimeVLMask;
30769
30770 // Bail out for 8-bit element types, because with a 2048-bit SVE register
30771 // size 8 bits are only sufficient to index into the first source vector.
30772 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
30773 return SDValue();
30774
30775 for (int Index : ShuffleMask) {
30776 // Handle poison index values.
30777 if (Index < 0)
30778 Index = 0;
30779 // If the mask refers to elements in the second operand, then we have to
30780 // offset the index by the number of elements in a vector. If this number
30781 // is not known at compile-time, we need to maintain a mask with 'VL' values
30782 // to add at runtime.
30783 if ((unsigned)Index >= ElementsPerVectorReg) {
30784 if (MinMaxEqual) {
30785 Index += IndexLen - ElementsPerVectorReg;
30786 } else {
30787 Index = Index - ElementsPerVectorReg;
30788 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
30789 }
30790 } else if (!MinMaxEqual)
30791 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30792 // For 8-bit elements and 1024-bit SVE registers, with MaxOffset equal
30793 // to 255, this might point to the last element of the second operand
30794 // of the shufflevector, so we reject this transform.
30795 if ((unsigned)Index >= MaxOffset)
30796 return SDValue();
30797 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
30798 }
30799
30800 // Pad the trailing lanes with an out-of-range index so they are zeroed,
30801 // rather than with index zero, which would duplicate the first lane into
30802 // them. Note that for i8 elements an out-of-range index could still be a
30803 // valid index for a 2048-bit vector register size.
30804 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
30805 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
30806 if (!MinMaxEqual)
30807 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
30808 }
30809
30810 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
30811 SDValue VecMask =
30812 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30813 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
30814
30815 SDValue Shuffle;
30816 if (IsSingleOp)
30817 Shuffle =
30818 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30819 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
30820 Op1, SVEMask);
30821 else if (Subtarget.hasSVE2()) {
30822 if (!MinMaxEqual) {
30823 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
30824 SDValue VScale = (BitsPerElt == 64)
30825 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
30826 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
30827 SDValue VecMask =
30828 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
30829 SDValue MulByMask = DAG.getNode(
30830 ISD::MUL, DL, MaskType,
30831 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
30832 DAG.getBuildVector(MaskType, DL,
30833 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
30834 SDValue UpdatedVecMask =
30835 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
30836 SVEMask = convertToScalableVector(
30837 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
30838 }
30839 Shuffle =
30840 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30841 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
30842 Op1, Op2, SVEMask);
30843 }
30844 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
30845 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
30846}
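// Illustrative example (not from the source): for a two-source v8i32 shuffle
// on a target with exactly 512-bit SVE registers, IndexLen is 16 while each
// fixed-length source only occupies 8 lanes, so a mask entry selecting lane 3
// of the second source (shuffle index 11) is rebased to TBL index
// 11 + (16 - 8) = 19, and the unused tail of the TBL mask is padded with an
// out-of-range index so those lanes read as zero.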
30847
30848SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
30849 SDValue Op, SelectionDAG &DAG) const {
30850 EVT VT = Op.getValueType();
30851 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30852
30853 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
30854 auto ShuffleMask = SVN->getMask();
30855
30856 SDLoc DL(Op);
30857 SDValue Op1 = Op.getOperand(0);
30858 SDValue Op2 = Op.getOperand(1);
30859
30860 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30861 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
30862 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
30863
30864 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
30865 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
30866 return MVT::i32;
30867 return ScalarTy;
30868 };
30869
30870 if (SVN->isSplat()) {
30871 unsigned Lane = std::max(0, SVN->getSplatIndex());
30872 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
30873 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
30874 DAG.getConstant(Lane, DL, MVT::i64));
30875 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
30876 return convertFromScalableVector(DAG, VT, Op);
30877 }
30878
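// A shuffle that extracts from the concatenation of the two inputs starting
// at element NumElements - 1 produces { Op1[N-1], Op2[0], ..., Op2[N-2] },
// which is exactly what INSR gives, so lower it as
// INSR(Op2, extract(Op1, N - 1)).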
30879 bool ReverseEXT = false;
30880 unsigned Imm;
30881 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
30882 Imm == VT.getVectorNumElements() - 1) {
30883 if (ReverseEXT)
30884 std::swap(Op1, Op2);
30885 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
30886 SDValue Scalar = DAG.getNode(
30887 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
30888 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
30889 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
30890 return convertFromScalableVector(DAG, VT, Op);
30891 }
30892
30893 unsigned EltSize = VT.getScalarSizeInBits();
30894 for (unsigned BlockSize : {64U, 32U, 16U}) {
30895 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), BlockSize)) {
30896 unsigned RevOp;
30897 if (EltSize == 8)
30898 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
30899 else if (EltSize == 16)
30900 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
30901 else
30902 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
30903 EVT BlockedVT =
30904 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), BlockSize));
30905 SDValue Pg = getPredicateForVector(DAG, DL, BlockedVT);
30906 SDValue BlockedOp1 = DAG.getNode(ISD::BITCAST, DL, BlockedVT, Op1);
30907 SDValue BlockedRev = DAG.getNode(RevOp, DL, BlockedVT, Pg, BlockedOp1,
30908 DAG.getUNDEF(BlockedVT));
30909 SDValue Container =
30910 DAG.getNode(ISD::BITCAST, DL, ContainerVT, BlockedRev);
30911 return convertFromScalableVector(DAG, VT, Container);
30912 }
30913 }
30914
30915 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
30916 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
30917 SDValue Pg = getPredicateForVector(DAG, DL, VT);
30918 SDValue Revd = DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, ContainerVT,
30919 Pg, Op1, DAG.getUNDEF(ContainerVT));
30920 return convertFromScalableVector(DAG, VT, Revd);
30921 }
30922
30923 unsigned WhichResult;
30924 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
30925 WhichResult == 0)
30926 return convertFromScalableVector(
30927 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
30928
30929 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
30930 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
30931 return convertFromScalableVector(
30932 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
30933 }
30934
30935 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
30936 return convertFromScalableVector(
30937 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
30938
30939 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
30940 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
30941 return convertFromScalableVector(
30942 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
30943 }
30944
30945 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
30946 // represents the same logical operation as performed by a ZIP instruction. In
30947 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
30948 // equivalent to an AArch64 instruction. There's the extra component of
30949 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
30950 // only operated on 64/128bit vector types that have a direct mapping to a
30951 // target register and so an exact mapping is implied.
30952 // However, when using SVE for fixed length vectors, most legal vector types
30953 // are actually sub-vectors of a larger SVE register. When mapping
30954 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
30955 // how the mask's indices translate. Specifically, when the mapping requires
30956 // an exact meaning for a specific vector index (e.g. Index X is the last
30957 // vector element in the register) then such mappings are often only safe when
30958 // the exact SVE register size is known. The main exception to this is when
30959 // indices are logically relative to the first element of either
30960 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
30961 // when converting from fixed-length to scalable vector types (i.e. the start
30962 // of a fixed length vector is always the start of a scalable vector).
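// For example, the reverse mask <3, 2, 1, 0> on a v4i32 shuffle maps to
// reversing the SVE container only when the register is known to be exactly
// 128 bits; with a wider register the reversed data would land in the upper,
// unused lanes of the container.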
30963 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
30964 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
30965 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
30966 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
30967 Op2.isUndef()) {
30968 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
30969 return convertFromScalableVector(DAG, VT, Op);
30970 }
30971
30972 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
30973 WhichResult != 0)
30974 return convertFromScalableVector(
30975 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
30976
30977 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
30978 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30979 return convertFromScalableVector(
30980 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
30981 }
30982
30983 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
30984 return convertFromScalableVector(
30985 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
30986
30987 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
30988 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30989 return convertFromScalableVector(
30990 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
30991 }
30992
30993 if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
30994 Subtarget->isSVEorStreamingSVEAvailable()) {
30996 "Unsupported SVE vector size");
30997
30998 unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock;
30999 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
31000 if (std::optional<unsigned> Lane =
31001 isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
31002 SDValue IID =
31003 DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
31004 return convertFromScalableVector(
31005 DAG, VT,
31006 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
31007 {IID, Op1,
31008 DAG.getConstant(*Lane, DL, MVT::i64,
31009 /*isTarget=*/true)}));
31010 }
31011 }
31012 }
31013
31014 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
31015 // This may allow the shuffle to be matched as something cheaper like ZIP1.
31016 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
31017 return WideOp;
31018
31019 // Avoid producing a TBL instruction if we don't know the minimum SVE
31020 // register size, unless NEON is not available, in which case we can assume
31021 // the minimum SVE register size is 128 bits.
31022 if (MinSVESize || !Subtarget->isNeonAvailable())
31023 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
31024 DAG);
31025
31026 return SDValue();
31027}
31028
31029SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
31030 SelectionDAG &DAG) const {
31031 SDLoc DL(Op);
31032 EVT InVT = Op.getValueType();
31033
31034 assert(VT.isScalableVector() && isTypeLegal(VT) &&
31035 InVT.isScalableVector() && isTypeLegal(InVT) &&
31036 "Only expect to cast between legal scalable vector types!");
31037 assert(VT.getVectorElementType() != MVT::i1 &&
31038 InVT.getVectorElementType() != MVT::i1 &&
31039 "For predicate bitcasts, use getSVEPredicateBitCast");
31040
31041 if (InVT == VT)
31042 return Op;
31043
31044 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
31045 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
31046
31047 // Safe bitcasting between unpacked vector types of different element counts
31048 // is currently unsupported because the following is missing the necessary
31049 // work to ensure the result's elements live where they're supposed to within
31050 // an SVE register.
31051 // 01234567
31052 // e.g. nxv2i32 = XX??XX??
31053 // nxv4f16 = X?X?X?X?
31054 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
31055 VT == PackedVT || InVT == PackedInVT) &&
31056 "Unexpected bitcast!");
31057
31058 // Pack input if required.
31059 if (InVT != PackedInVT)
31060 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
31061
31062 if (Subtarget->isLittleEndian() ||
31063 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
31064 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
31065 else {
31066 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
31067 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
31068
31069 // Simulate the effect of casting through memory.
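// For example, on big-endian a nxv2f64 -> nxv4f32 cast becomes: bitcast to
// nxv2i64, byte-swap the 64-bit lanes, NVCAST to nxv4i32, byte-swap the
// 32-bit lanes, then bitcast to nxv4f32.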
31070 Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
31071 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
31072 Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
31073 Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
31074 if (PackedVTAsInt.getScalarSizeInBits() != 8)
31075 Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
31076 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
31077 }
31078
31079 // Unpack result if required.
31080 if (VT != PackedVT)
31081 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
31082
31083 return Op;
31084}
31085
31086 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
31087 SDValue N) const {
31088 return ::isAllActivePredicate(DAG, N);
31089}
31090
31091 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
31092 return ::getPromotedVTForPredicate(VT);
31093}
31094
31095bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
31096 SDValue Op, const APInt &OriginalDemandedBits,
31097 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
31098 unsigned Depth) const {
31099
31100 unsigned Opc = Op.getOpcode();
31101 switch (Opc) {
31102 case AArch64ISD::VSHL: {
31103 // Match (VSHL (VLSHR Val X) X)
31104 SDValue ShiftL = Op;
31105 SDValue ShiftR = Op->getOperand(0);
31106 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
31107 return false;
31108
31109 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
31110 return false;
31111
31112 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
31113 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
31114
31115 // Other cases can be handled as well, but this is not
31116 // implemented.
31117 if (ShiftRBits != ShiftLBits)
31118 return false;
31119
31120 unsigned ScalarSize = Op.getScalarValueSizeInBits();
31121 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
31122
31123 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
31124 APInt UnusedBits = ~OriginalDemandedBits;
31125
31126 if ((ZeroBits & UnusedBits) != ZeroBits)
31127 return false;
31128
31129 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
31130 // used - simplify to just Val.
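// For example, with v8i16 operands and a shift amount of 8 the pair clears
// the low 8 bits of each lane, so Val is equivalent whenever only the high
// 8 bits are demanded.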
31131 return TLO.CombineTo(Op, ShiftR->getOperand(0));
31132 }
31133 case AArch64ISD::BICi: {
31134 // Fold away the BICi if the bits it would clear are already known to be zero.
31135 SDValue Op0 = Op.getOperand(0);
31136 KnownBits KnownOp0 =
31137 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
31138 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
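// For example, BICi(Op0, 0xff, 8) clears bits 15:8 of every lane; if those
// bits are already known to be zero in Op0, the BICi is redundant.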
31139 APInt BitsToClear =
31140 (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
31141 .trunc(KnownOp0.getBitWidth());
31142 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
31143 if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
31144 return TLO.CombineTo(Op, Op0);
31145
31146 Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
31147 return false;
31148 }
31149 case ISD::INTRINSIC_WO_CHAIN: {
31150 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
31151 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
31152 if (!MaxSVEVectorSizeInBits)
31153 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
31154 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
31155 // The SVE count intrinsics don't support the multiplier immediate so we
31156 // don't have to account for that here. The value returned may be slightly
31157 // over the true required bits, as this is based on the "ALL" pattern. The
31158 // other patterns are also exposed by these intrinsics, but they all
31159 // return a value that's strictly less than "ALL".
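// For example, CNTW with the default 2048-bit upper bound can return at most
// 2048 / 32 == 64, which needs only bit_width(64) == 7 bits, so all higher
// bits of the result are known to be zero.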
31160 unsigned RequiredBits = llvm::bit_width(MaxElements);
31161 unsigned BitWidth = Known.Zero.getBitWidth();
31162 if (RequiredBits < BitWidth)
31163 Known.Zero.setHighBits(BitWidth - RequiredBits);
31164 return false;
31165 }
31166 }
31167 }
31168
31169 return TargetLowering::SimplifyDemandedBitsForTargetNode(
31170 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
31171}
31172
31173bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
31174 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
31175 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
31176
31177 // TODO: Add more target nodes.
31178 switch (Op.getOpcode()) {
31179 case AArch64ISD::MOVI:
31180 case AArch64ISD::MOVIedit:
31181 case AArch64ISD::MOVImsl:
31182 case AArch64ISD::MOVIshift:
31183 case AArch64ISD::MVNImsl:
31184 case AArch64ISD::MVNIshift:
31185 case AArch64ISD::VASHR:
31186 case AArch64ISD::VLSHR:
31187 case AArch64ISD::VSHL:
31188 return false;
31189 }
31190 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
31191 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
31192}
31193
31194bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
31195 return Op.getOpcode() == AArch64ISD::DUP ||
31196 Op.getOpcode() == AArch64ISD::MOVI ||
31197 Op.getOpcode() == AArch64ISD::MOVIshift ||
31198 Op.getOpcode() == AArch64ISD::MOVImsl ||
31199 Op.getOpcode() == AArch64ISD::MOVIedit ||
31200 Op.getOpcode() == AArch64ISD::MVNIshift ||
31201 Op.getOpcode() == AArch64ISD::MVNImsl ||
31202 // Also accept fneg(movi(0)): if it were folded to FPConstant(-0.0),
31203 // ISel would select fmov(mov i64 0x8000000000000000), i.e. a GPR-to-FPR
31204 // fmov, which is more expensive than fneg(movi(0)).
31205 (Op.getOpcode() == ISD::FNEG &&
31206 Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit &&
31207 Op.getOperand(0).getConstantOperandVal(0) == 0) ||
31208 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
31209 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
31210 TargetLowering::isTargetCanonicalConstantNode(Op);
31211}
31212
31213 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
31214 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
31215 Subtarget->hasComplxNum();
31216}
31217
31218 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
31219 ComplexDeinterleavingOperation Operation, Type *Ty) const {
31220 auto *VTy = dyn_cast<VectorType>(Ty);
31221 if (!VTy)
31222 return false;
31223
31224 // If the vector is scalable, SVE is enabled, implying support for complex
31225 // numbers. Otherwise, we need to ensure complex number support is available.
31226 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
31227 return false;
31228
31229 auto *ScalarTy = VTy->getScalarType();
31230 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
31231
31232 // We can only process vectors with a bit size of 128 or higher (or exactly
31233 // 64 bits for Neon). Additionally, these vectors must have a power-of-2
31234 // size, as we later split them into the smallest supported size and merge
31235 // them back together after applying the complex operation.
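// For example, a 512-bit <16 x float> is accepted and later split into two
// 256-bit halves, each of which is split again into 128-bit quarters.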
31236 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
31237 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
31238 !llvm::isPowerOf2_32(VTyWidth))
31239 return false;
31240
31241 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
31242 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
31243
31244 if (Operation == ComplexDeinterleavingOperation::CDot)
31245 return ScalarWidth == 32 || ScalarWidth == 64;
31246 return 8 <= ScalarWidth && ScalarWidth <= 64;
31247 }
31248
31249 // CDot is not supported outside of scalable/sve scopes
31250 if (Operation == ComplexDeinterleavingOperation::CDot)
31251 return false;
31252
31253 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
31254 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
31255}
31256
31257 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
31258 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
31259 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
31260 Value *Accumulator) const {
31261 VectorType *Ty = cast<VectorType>(InputA->getType());
31262 if (Accumulator == nullptr)
31263 Accumulator = Constant::getNullValue(Ty);
31264 bool IsScalable = Ty->isScalableTy();
31265 bool IsInt = Ty->getElementType()->isIntegerTy();
31266
31267 unsigned TyWidth =
31268 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
31269
31270 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
31271 "Vector type must be either 64 or a power of 2 that is at least 128");
31272
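// Types wider than 128 bits are split in half and the two halves are handled
// recursively; e.g. a <vscale x 8 x float> CMLA is emitted as two
// <vscale x 4 x float> operations whose results are concatenated.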
31273 if (TyWidth > 128) {
31274 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
31275 int AccStride = cast<VectorType>(Accumulator->getType())
31276 ->getElementCount()
31277 .getKnownMinValue() /
31278 2;
31279 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
31280 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
31281 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
31282 auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
31283 auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
31284 Value *LowerSplitAcc = nullptr;
31285 Value *UpperSplitAcc = nullptr;
31286 Type *FullTy = Ty;
31287 FullTy = Accumulator->getType();
31288 auto *HalfAccTy = VectorType::getHalfElementsVectorType(
31289 cast<VectorType>(Accumulator->getType()));
31290 LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
31291 UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
31292 auto *LowerSplitInt = createComplexDeinterleavingIR(
31293 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
31294 auto *UpperSplitInt = createComplexDeinterleavingIR(
31295 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
31296
31297 auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
31298 LowerSplitInt, uint64_t(0));
31299 return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
31300 }
31301
31302 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
31303 if (IsScalable) {
31304 if (IsInt)
31305 return B.CreateIntrinsic(
31306 Intrinsic::aarch64_sve_cmla_x, Ty,
31307 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31308
31309 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
31310 return B.CreateIntrinsic(
31311 Intrinsic::aarch64_sve_fcmla, Ty,
31312 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31313 }
31314
31315 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
31316 Intrinsic::aarch64_neon_vcmla_rot90,
31317 Intrinsic::aarch64_neon_vcmla_rot180,
31318 Intrinsic::aarch64_neon_vcmla_rot270};
31319
31320
31321 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
31322 {Accumulator, InputA, InputB});
31323 }
31324
31325 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
31326 if (IsScalable) {
31327 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
31328 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
31329 if (IsInt)
31330 return B.CreateIntrinsic(
31331 Intrinsic::aarch64_sve_cadd_x, Ty,
31332 {InputA, InputB, B.getInt32((int)Rotation * 90)});
31333
31334 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
31335 return B.CreateIntrinsic(
31336 Intrinsic::aarch64_sve_fcadd, Ty,
31337 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
31338 }
31339 return nullptr;
31340 }
31341
31342 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
31343 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
31344 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
31345 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
31346 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
31347
31348 if (IntId == Intrinsic::not_intrinsic)
31349 return nullptr;
31350
31351 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
31352 }
31353
31354 if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
31355 IsScalable) {
31356 return B.CreateIntrinsic(
31357 Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
31358 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
31359 }
31360
31361 return nullptr;
31362}
31363
31364bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
31365 unsigned Opc = N->getOpcode();
31366 if (ISD::isExtOpcode(Opc)) {
31367 if (any_of(N->users(),
31368 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
31369 return false;
31370 }
31371 return true;
31372}
31373
31374 unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
31375 return Subtarget->getMinimumJumpTableEntries();
31376}
31377
31378 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
31379 CallingConv::ID CC,
31380 EVT VT) const {
31381 bool NonUnitFixedLengthVector =
31383 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
31384 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
31385
31386 EVT VT1;
31387 MVT RegisterVT;
31388 unsigned NumIntermediates;
31389 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
31390 RegisterVT);
31391 return RegisterVT;
31392}
31393
31394 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
31395 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
31396 bool NonUnitFixedLengthVector =
31398 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
31399 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
31400
31401 EVT VT1;
31402 MVT VT2;
31403 unsigned NumIntermediates;
31404 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
31405 NumIntermediates, VT2);
31406}
31407
31408 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
31409 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
31410 unsigned &NumIntermediates, MVT &RegisterVT) const {
31411 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
31412 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
31413 if (!RegisterVT.isFixedLengthVector() ||
31414 RegisterVT.getFixedSizeInBits() <= 128)
31415 return NumRegs;
31416
31417 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
31418 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
31419 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
31420
31421 // A size mismatch here implies either type promotion or widening and would
31422 // have resulted in scalarisation if larger vectors had not been available.
31423 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
31424 EVT EltTy = VT.getVectorElementType();
31425 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
31426 if (!isTypeLegal(NewVT))
31427 NewVT = EltTy;
31428
31429 IntermediateVT = NewVT;
31430 NumIntermediates = VT.getVectorNumElements();
31431 RegisterVT = getRegisterType(Context, NewVT);
31432 return NumIntermediates;
31433 }
31434
31435 // SVE VLS support does not introduce a new ABI so we should use NEON sized
31436 // types for vector arguments and returns.
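// For example, with 512-bit SVE vectors a fixed-length v16i32 argument is
// passed as four v4i32 parts rather than as one 512-bit register.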
31437
31438 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
31439 NumIntermediates *= NumSubRegs;
31440 NumRegs *= NumSubRegs;
31441
31442 switch (RegisterVT.getVectorElementType().SimpleTy) {
31443 default:
31444 llvm_unreachable("unexpected element type for vector");
31445 case MVT::i8:
31446 IntermediateVT = RegisterVT = MVT::v16i8;
31447 break;
31448 case MVT::i16:
31449 IntermediateVT = RegisterVT = MVT::v8i16;
31450 break;
31451 case MVT::i32:
31452 IntermediateVT = RegisterVT = MVT::v4i32;
31453 break;
31454 case MVT::i64:
31455 IntermediateVT = RegisterVT = MVT::v2i64;
31456 break;
31457 case MVT::f16:
31458 IntermediateVT = RegisterVT = MVT::v8f16;
31459 break;
31460 case MVT::f32:
31461 IntermediateVT = RegisterVT = MVT::v4f32;
31462 break;
31463 case MVT::f64:
31464 IntermediateVT = RegisterVT = MVT::v2f64;
31465 break;
31466 case MVT::bf16:
31467 IntermediateVT = RegisterVT = MVT::v8bf16;
31468 break;
31469 }
31470
31471 return NumRegs;
31472}
31473
31474 bool AArch64TargetLowering::hasInlineStackProbe(
31475 const MachineFunction &MF) const {
31476 return !Subtarget->isTargetWindows() &&
31477 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
31478}
31479
31481 switch (Opc) {
31485 if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
31486 return true;
31487 }
31488
31490}
31491
31493 EVT VT) const {
31494 return Subtarget->hasCPA() && UseFEATCPACodegen;
31495}
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST, APInt &DefBits)
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static SDValue performVectorDeinterleaveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static const MachineInstr * stripVRegCopies(const MachineRegisterInfo &MRI, Register Reg)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue performSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
If the operand is a bitwise AND with a constant RHS, and the shift has a constant RHS and is the only...
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
bool isVectorizedBinOp(unsigned Opcode)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static SDValue emitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &DL, SelectionDAG &DAG)
Emit vector comparison for floating-point values, producing a mask.
static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG)
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static SDValue performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static bool shouldLowerTailCallStackArg(const MachineFunction &MF, const CCValAssign &VA, SDValue Arg, ISD::ArgFlagsTy Flags, int CallOffset)
Check whether a stack argument requires lowering in a tail call.
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerPtrAuthGlobalAddressStatically(SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC, SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static bool isCMP(SDValue Op)
return SDValue()
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget, const AtomicRMWInst *RMW)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
unsigned numberOfInstrToLoadImm(APInt C)
static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static bool callConvSupportsVarArgs(CallingConv::ID CC)
Return true if the call convention supports varargs Currently only those that pass varargs like the C...
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static std::optional< std::pair< unsigned, const TargetRegisterClass * > > parseSVERegAsConstraint(StringRef Constraint)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG)
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
constexpr MVT CondCodeVT
Value type used for condition codes.
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG)
SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
bool isLegalCmpImmed(APInt C)
static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &DL)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Value * createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *ZExtTy, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC, SDValue RHS={})
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *ST)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtInReg(const SDValue &V)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Tries to replace scalar FP <-> INT conversions with SVE in streaming functions, this can help to redu...
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue getCondCode(SelectionDAG &DAG, AArch64CC::CondCode CC)
Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
cl::opt< bool > EnableSVEGISel("aarch64-enable-gisel-sve", cl::Hidden, cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false))
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, ISD::CondCode CC, bool NoNaNs, const SDLoc &DL, SelectionDAG &DAG)
For SELECT_CC, when the true/false values are (-1, 0) and the compared values are scalars,...
static SDValue getZT0FrameIndex(MachineFrameInfo &MFI, AArch64FunctionInfo &FuncInfo, SelectionDAG &DAG)
static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC)
static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static AArch64SME::ToggleCondition getSMToggleCondition(const SMECallAttrs &CallAttrs)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue isNVCastToHalfWidthElements(SDValue V)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static Value * createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *DstTy, bool IsLittleEndian)
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SMECallAttrs getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI, const TargetLowering::CallLoweringInfo &CLI)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, SelectionDAG &DAG, AArch64FunctionInfo *Info, SDLoc DL, SDValue Chain, bool IsSave)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const SDLoc DL)
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static cl::opt< bool > UseFEATCPACodegen("aarch64-use-featcpa-codegen", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in " "SelectionDAG for FEAT_CPA"), cl::init(false))
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
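The static combine helpers listed above all follow the same contract: inspect the node, return an empty SDValue() when the pattern does not apply, and otherwise return the replacement value. A minimal, hypothetical sketch of that contract (the fold shown, add-with-zero, and the helper name are invented for illustration and are not one of the combines above):

static SDValue performExampleAddCombine(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() != ISD::ADD)
    return SDValue();                      // not our pattern
  if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
    if (C->isZero())
      return N->getOperand(0);             // (add x, 0) -> x
  return SDValue();                        // no change: caller keeps N
}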
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
static bool isConstant(const MachineInstr &MI)
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
basic Basic Alias true
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
@ Default
static bool isSigned(unsigned int Opcode)
#define Check(C,...)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) such that 'BinOp V, RHS' can be simplified.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
mir Rename Register Operands
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
This file provides utility analysis objects describing memory locations.
#define T
This file defines ARC utility functions which are used by various parts of the compiler.
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
static LLVM_ATTRIBUTE_ALWAYS_INLINE MVT::SimpleValueType getSimpleVT(const unsigned char *MatcherTable, unsigned &MatcherIndex)
getSimpleVT - Decode a value in MatcherTable, if it's a VBR encoded value, use GetVBR to decode it.
This file defines the SmallSet class.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
static bool Enabled
Definition Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
static llvm::Type * getVectorElementType(llvm::Type *Ty)
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
The Input class is used to parse a yaml document into in-memory structs and vectors.
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMaximumJumpTableSize() const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
bool isStreamingCompatible() const
Returns true if the function has a streaming-compatible body.
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const
unsigned getMinSVEVectorSizeInBits() const
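Lowering decisions in this file are typically gated on the AArch64Subtarget queries above. The helper below is a hedged sketch of that pattern, using only two of the accessors listed; the condition itself is illustrative, not the file's actual policy:

static bool canUseFixedLengthSVE(const AArch64Subtarget &ST,
                                 unsigned FixedVTSizeInBits) {
  // A fixed-length vector can be lowered with SVE only if the subtarget opts
  // in and the vector fits in the guaranteed minimum SVE register size.
  return ST.useSVEForFixedLengthVectors() &&
         FixedVTSizeInBits <= ST.getMinSVEVectorSizeInBits();
}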
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, bool InsertVectorLengthCheck=false) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the preferred common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a stN intrinsic.
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp, MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const
Replace (0, vreg) discriminator components with the operands of blend or with (immediate,...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a ldN intrinsic.
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
MachineBasicBlock * EmitCheckMatchingVL(MachineInstr &MI, MachineBasicBlock *MBB) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purposes.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist an instruction in then/else to the if.
bool isOpSuitableForRCPC3(const Instruction *I) const
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override
Return true if the @llvm.experimental.vector.match intrinsic should be expanded for vector type ‘VT’ ...
MachineBasicBlock * EmitEntryPStateSM(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
In AArch64, true if FEAT_CPA is present.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
MachineBasicBlock * EmitAllocateSMESaveBuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
const AArch64TargetMachine & getTM() const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
MachineBasicBlock * EmitGetSMESaveSize(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached to strict FP calls.
bool lowerInterleaveIntrinsicToStore(Instruction *Store, Value *Mask, ArrayRef< Value * > InterleaveValues) const override
Lower an interleave intrinsic to a target specific store intrinsic.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
bool useNewSMEABILowering() const
Returns true if the new SME ABI lowering should be used.
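Many of the AArch64TargetLowering overrides above answer small legality questions for the DAG combiner and legalizer. As a self-contained illustration of the kind of check behind a hook such as isLegalAddImmediate: AArch64 ADD/SUB immediates are 12 bits wide, optionally shifted left by 12, so a free-standing sketch (not the actual override) might look like:

#include <cstdint>
#include <cstdlib>

static bool isLegalAddImmediateSketch(int64_t Imm) {
  if (Imm == INT64_MIN)
    return false;               // std::abs would overflow
  uint64_t U = std::abs(Imm);   // a SUB can absorb the negated immediate
  // Either a plain 12-bit immediate, or a 12-bit immediate shifted left by 12.
  return (U >> 12) == 0 || ((U & 0xfff) == 0 && (U >> 24) == 0);
}

The real override also has a scalable-vector counterpart, isLegalAddScalableImmediate, declared above.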
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
static LLVM_ABI void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition APInt.cpp:1890
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
LLVM_ABI APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1928
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition APInt.h:1166
LLVM_ABI APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1935
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
an instruction to allocate memory on the stack
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
@ FAdd
*p = old + v
@ FMinimum
*p = minimum(old, v); minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ And
*p = old & v
@ FMaximum
*p = maximum(old, v); maximum matches the behavior of llvm.maximum.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
@ Nand
*p = ~(old & v)
bool isFloatingPointOperation() const
BinOp getOperation() const
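The AtomicRMWInst entries above are consulted when deciding how an atomicrmw should be expanded. A hedged sketch of that kind of classification, using only members listed here (the grouping chosen is illustrative):

#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool isMinMaxRMW(const AtomicRMWInst &RMW) {
  switch (RMW.getOperation()) {
  case AtomicRMWInst::Min:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::FMin:   // floating-point variants; see
  case AtomicRMWInst::FMax:   // isFloatingPointOperation() above
    return true;
  default:
    return false;
  }
}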
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const BlockAddress * getBlockAddress() const
Function * getFunction() const
Definition Constants.h:935
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
LLVM_ABI std::optional< std::pair< APInt, APInt > > isConstantSequence() const
If this BuildVector is constant and represents the numerical series "<a, a+n, a+2n,...
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
LLVM_ABI bool isConstant() const
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:207
bool isBigEndian() const
Definition DataLayout.h:208
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
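The DataLayout queries above drive size and alignment decisions during lowering. A minimal usage sketch (the helper function and its parameters are assumptions made for the example):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static void dataLayoutExamples(const DataLayout &DL, LLVMContext &Ctx) {
  Type *I64 = Type::getInt64Ty(Ctx);
  bool Little = DL.isLittleEndian();        // endianness of the target
  TypeSize Sz = DL.getTypeAllocSize(I64);   // typically 8 bytes
  Align A = DL.getPrefTypeAlign(I64);       // preferred alignment of i64
  (void)Little; (void)Sz; (void)A;
}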
A debug info location.
Definition DebugLoc.h:124
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:187
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
static FixedVectorType * getInteger(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
arg_iterator arg_end()
Definition Function.h:875
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
const Argument * const_arg_iterator
Definition Function.h:73
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
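Attributes of the enclosing Function, queried through the accessors above, often steer code-generation policy. A tiny hedged example (the policy expressed, and the helper name, are invented):

#include "llvm/IR/Function.h"
using namespace llvm;

static bool prefersSmallCode(const Function &F) {
  // hasMinSize() (-Oz) implies hasOptSize() (-Os); both are shown for clarity.
  return F.hasOptSize() || F.hasMinSize();
}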
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
bool hasExternalWeakLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:132
Type * getValueType() const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition IRBuilder.h:1936
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2251
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2508
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:605
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:552
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
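The IRBuilder members above are what this file uses when it has to synthesize IR (for example in the load-linked/store-conditional and stack-protector hooks). A self-contained sketch of the chaining style; the helper name, the callee value, and the function type chosen here are assumptions of the example:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static CallInst *emitByteHelperCall(IRBuilderBase &Builder, Value *Buf,
                                    Value *Callee) {
  // Address byte 16 of the buffer, then pass that pointer to the callee.
  Value *Ptr = Builder.CreateConstGEP1_32(Builder.getInt8Ty(), Buf, 16);
  FunctionType *FTy = FunctionType::get(Builder.getPtrTy(),
                                        {Builder.getPtrTy()},
                                        /*isVarArg=*/false);
  return Builder.CreateCall(FTy, Callee, {Ptr});
}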
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
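The MVT accessors above are the basic vocabulary for the type legality checks in this file. A stand-alone example; the comments give the values these queries would return for v4i32:

#include "llvm/CodeGen/ValueTypes.h"
#include <cstdint>
using namespace llvm;

static void mvtExamples() {
  MVT VT = MVT::getVectorVT(MVT::i32, 4);        // v4i32
  bool IsVec = VT.isVector();                    // true
  unsigned NumElts = VT.getVectorNumElements();  // 4
  MVT EltVT = VT.getVectorElementType();         // i32
  MVT Half = VT.getHalfNumVectorElementsVT();    // v2i32
  uint64_t Bits = VT.getFixedSizeInBits();       // 128
  (void)IsVec; (void)NumElts; (void)EltVT; (void)Half; (void)Bits;
}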
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
bool isImmutableObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to an immutable object.
int getStackProtectorIndex() const
Return the index for the stack protector object.
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
LLVM_ABI int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
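Stack objects such as spill slots are created through the MachineFrameInfo interface above. A minimal hedged sketch (the 8-byte size is an arbitrary choice for the example):

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include <cassert>
using namespace llvm;

static int createSpillSlot(MachineFunction &MF) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int FI = MFI.CreateSpillStackObject(/*Size=*/8, Align(8));
  assert(MFI.getObjectSize(FI) == 8 && "unexpected spill slot size");
  return FI;
}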
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
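The MachineInstrBuilder methods above are used by the custom inserters (EmitInstrWithCustomInserter and friends) when they expand a pseudo into real instructions. A hedged sketch of the chaining style; the opcode is left as a parameter because the exact instruction and its operand list are target-specific:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

static MachineInstr *emitRegImmOp(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator InsertPt,
                                  const DebugLoc &DL,
                                  const TargetInstrInfo &TII, unsigned Opcode,
                                  Register Dst, Register Src, int64_t Imm) {
  // Real opcodes may require extra operands (e.g. a shift amount); only the
  // builder chaining itself is being illustrated here.
  return BuildMI(MBB, InsertPt, DL, TII.get(Opcode), Dst)
      .addReg(Src)
      .addImm(Imm);
}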
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition MapVector.h:56
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition Module.cpp:712
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
void dropFlags(unsigned Mask)
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAssert() const
Test if this node is an assert operation.
op_iterator op_begin() const
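The SDNode accessors above are the traversal vocabulary used throughout PerformDAGCombine. A small stand-alone example of the idioms (the predicate itself is invented for illustration):

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

static bool allOperandsAreConstant(const SDNode *N) {
  if (N->getNumOperands() == 0)
    return false;
  for (SDValue Op : N->op_values())
    if (!isa<ConstantSDNode>(Op))
      return false;
  return true;
}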
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasSharedZAInterface() const
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresEnablingZAAfterCall() const
bool requiresPreservingZT0() const
bool requiresDisablingZABeforeCall() const
bool requiresPreservingAllZAState() const
Class to represent scalable SIMD vectors.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:825
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, bool ConstantFold=true)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType)
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, unsigned OpFlags)
Set CalledGlobal to be associated with Node.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
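The SelectionDAG factory methods above are how lowering and combine code builds replacement DAG nodes. A minimal sketch (hypothetical helper, assuming it runs inside a lowering/combine callback where DAG and a debug location DL are available):
static llvm::SDValue emitClampToZero(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                                     llvm::SDValue X) {
  using namespace llvm;
  EVT VT = X.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, VT);            // materialize constant 0 of type VT
  // max(X, 0) expressed as select_cc(X, 0, X, 0, setgt)
  return DAG.getSelectCC(DL, X, Zero, X, Zero, ISD::SETGT);
}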
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
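The shuffle-mask predicates above classify explicit shuffle masks. A minimal sketch (hypothetical helper, not from this file) checking for a reversal of a single 4-element source:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h" // ShuffleVectorInst mask helpers
static bool isReverseOfFour(llvm::ArrayRef<int> Mask) {
  // <3,2,1,0> chooses all elements of one 4-element source in reverse order
  return llvm::ShuffleVectorInst::isReverseMask(Mask, /*NumSrcElts=*/4);
}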
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:472
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition StringRef.h:573
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition StringRef.h:261
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition StringRef.h:611
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition StringRef.h:686
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
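StringRef parsing and StringSwitch dispatch of the kind listed above typically show up when decoding register or constraint names. A minimal sketch (hypothetical names and mapping, not from this file):
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
static bool parseVectorRegName(llvm::StringRef Name, unsigned &Num) {
  if (!Name.starts_with("v"))
    return false;
  // getAsInteger returns true on parse failure
  return !Name.drop_front().getAsInteger(10, Num);
}
static unsigned suffixToBits(llvm::StringRef Suffix) {
  return llvm::StringSwitch<unsigned>(Suffix)
      .Case("b", 8)
      .Case("h", 16)
      .Case("s", 32)
      .Case("d", 64)
      .Default(0);
}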
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
virtual unsigned getMinimumJumpTableEntries() const
Return lower limit for number of blocks in a jump table.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action)
Indicate how a PARTIAL_REDUCE_U/SMLA node with Acc type AccVT and Input type InputVT should be treate...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
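The TargetLoweringBase hooks above are normally invoked from a target's TargetLowering constructor to describe which operations are natively supported. A minimal, hypothetical constructor fragment (illustrative settings only, not the choices made by this file):
// inside a TargetLowering subclass constructor
setOperationAction(ISD::CTPOP, MVT::i64, Expand);          // expand to shifts/adds
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);      // handled in LowerOperation
setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i8, Legal); // sign-extending byte load is native
setTruncStoreAction(MVT::f64, MVT::f32, Expand);           // no direct truncating FP store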
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool useLoadStackGuardNode(const Module &M) const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
unsigned getPointerSize(unsigned AS) const
Get the pointer size for this target.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
LLVM_ABI InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
This class represents a truncation of integer types.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
static LLVM_ABI IntegerType * getInt128Ty(LLVMContext &C)
Definition Type.cpp:299
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
@ HalfTyID
16-bit floating point type
Definition Type.h:56
@ FloatTyID
32-bit floating point type
Definition Type.h:58
@ BFloatTyID
16-bit floating point type (7-bit significand)
Definition Type.h:57
@ DoubleTyID
64-bit floating point type
Definition Type.h:59
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:296
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
static LLVM_ABI Type * getBFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:283
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
LLVM_ABI void dump() const
Support for debugging, callable in GDB: V->dump()
Base class of all SIMD vector types.
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
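The Type and VectorType factories above construct the IR types this lowering code queries. A minimal sketch (hypothetical helper, not from this file) building fixed and scalable 32-bit integer vector types:
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
static void makeVectorTypes(llvm::LLVMContext &Ctx) {
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  auto *V4I32 = llvm::VectorType::get(I32, llvm::ElementCount::getFixed(4)); // <4 x i32>
  auto *NxV4I32 = llvm::ScalableVectorType::get(I32, /*MinNumElts=*/4);      // <vscale x 4 x i32>
  (void)V4I32;
  (void)NxV4I32;
}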
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isValidCBCond(AArch64CC::CondCode Code)
True, if a given condition code can be used in fused compare-and-branch instructions,...
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
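The AArch64CC helpers above manipulate condition codes when conditional instructions are rewritten. A minimal fragment (assuming the AArch64 MCTargetDesc header that defines AArch64CC is included):
// invert a condition and query the NZCV flags that would satisfy the inverse
llvm::AArch64CC::CondCode Inv =
    llvm::AArch64CC::getInvertedCondCode(llvm::AArch64CC::EQ); // EQ -> NE
unsigned NZCV = llvm::AArch64CC::getNZCVToSatisfyCondCode(Inv);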
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint64_t decodeAdvSIMDModImmType10(uint8_t Imm)
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isSVECpyDupImm(int SizeInBits, int64_t Val, int32_t &Imm, int32_t &Shift)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
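The AArch64_AM predicates and encoders above validate and encode AArch64 immediates. A minimal sketch (hypothetical helper; assumes the MCTargetDesc AArch64AddressingModes.h header is included):
static bool tryEncodeLogicalImm64(uint64_t Imm, uint64_t &Enc) {
  if (!llvm::AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64))
    return false; // value cannot be expressed as a 64-bit logical immediate
  Enc = llvm::AArch64_AM::encodeLogicalImmediate(Imm, /*regSize=*/64);
  return true;
}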
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNormalMaskedLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked load.
bool isNormalMaskedStore(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked store.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ LOOP_DEPENDENCE_RAW_MASK
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ TRUNCATE_SSAT_U
Definition ISDOpcodes.h:855
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor to...
Definition ISDOpcodes.h:622
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition ISDOpcodes.h:682
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition ISDOpcodes.h:100
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition ISDOpcodes.h:627
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition ISDOpcodes.h:648
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively places vector elements based on mask e....
Definition ISDOpcodes.h:690
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor ...
Definition ISDOpcodes.h:611
@ TRUNCATE_SSAT_S
TRUNCATE_[SU]SAT_[SU] - Truncate for saturated operand [SU] located in middle, prefix for SAT means i...
Definition ISDOpcodes.h:853
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ TRUNCATE_USAT_U
Definition ISDOpcodes.h:857
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
@ LOOP_DEPENDENCE_WAR_MASK
@ SET_ROUNDING
Set rounding mode.
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
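A minimal sketch, not taken from this file, of how these two condition-code helpers relate; the header names and the use of MVT::i32 as a representative comparison type are assumptions for illustration.
#include "llvm/CodeGen/ISDOpcodes.h" // assumed to declare the ISD::getSetCC* helpers
#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>
using namespace llvm;
static void setCCHelperSketch() {
  // The inverse of "X u< Y" is "X u>= Y".
  assert(ISD::getSetCCInverse(ISD::SETULT, MVT::i32) == ISD::SETUGE);
  // "Y u< X", rewritten with the operands back in (X, Y) order, is "X u> Y".
  assert(ISD::getSetCCSwappedOperands(ISD::SETULT) == ISD::SETUGT);
}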
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
bool match(Val *V, const Pattern &P)
CastInst_match< OpTy, UIToFPInst > m_UIToFP(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
const unsigned VectorBits
Definition SystemZ.h:154
initializer< Ty > init(const Ty &Val)
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition ObjCARCUtil.h:43
bool attachedCallOpBundleNeedsMarker(const CallBase *CB)
This function determines whether the clang_arc_attachedcall should be emitted with or without the mar...
Definition ObjCARCUtil.h:58
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
bool isPackedVectorType(EVT SomeVT)
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
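A small usage sketch, not from this file, for the range helpers listed above (drop_begin, zip, find, all_of); the vector contents are arbitrary.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>
using namespace llvm;
static void rangeHelperSketch() {
  SmallVector<int, 4> Vals = {1, 2, 3, 4};
  // all_of: every element satisfies the predicate.
  assert(all_of(Vals, [](int V) { return V > 0; }));
  // find: iterator to the first matching element.
  auto It = find(Vals, 3);
  assert(It != Vals.end() && *It == 3);
  // zip + drop_begin: visit the adjacent pairs (1,2), (2,3), (3,4).
  for (auto [A, B] : zip(Vals, drop_begin(Vals)))
    assert(B == A + 1);
}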
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:216
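For illustration, with arbitrarily chosen widths:
#include "llvm/Support/MathExtras.h"
static_assert(llvm::maxUIntN(8) == 255, "largest value of an 8-bit unsigned field");
static_assert(llvm::maxUIntN(12) == 4095, "largest value of a 12-bit unsigned field");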
LLVM_ABI void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:294
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:252
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
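A quick illustration with arbitrary constants:
#include "llvm/Support/MathExtras.h"
static_assert(llvm::isPowerOf2_64(64), "64 == 2^6, so this is a power of two");
static_assert(!llvm::isPowerOf2_64(0) && !llvm::isPowerOf2_64(96),
              "zero and non-powers of two are rejected");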
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI void reportFatalInternalError(Error Err)
Report a fatal error that indicates a bug in LLVM.
Definition Error.cpp:177
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
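An illustration of the two bit-pattern helpers above (countr_zero, isShiftedMask_64), with arbitrary constants:
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
static void bitHelperSketch() {
  // One contiguous run of ones (bits 16..23) is a shifted mask...
  assert(llvm::isShiftedMask_64(0x00FF0000ULL));
  // ...while two separate runs are not.
  assert(!llvm::isShiftedMask_64(0x00FF00FFULL));
  // countr_zero counts the zero bits below the run.
  assert(llvm::countr_zero(0x00FF0000u) == 16);
}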
unsigned M1(unsigned Val)
Definition VE.h:377
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
generic_gep_type_iterator<> gep_type_iterator
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:270
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
Definition ModRef.h:68
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
TargetTransformInfo TTI
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
CombineLevel
Definition DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
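A hedged sketch of how the zip/uzp/trn mask predicates above might be queried. The declaring header is not named here (assumed to come from the AArch64 backend), the mask is the documented zip1 form for eight lanes, and the meaning of WhichResult (0 for the '1' variant, 1 for the '2' variant) is an assumption.
#include "llvm/ADT/ArrayRef.h"
#include <cassert>
static void shuffleMaskSketch() {
  // Documented zip1 form for 8 lanes: <0, 8, 1, 9, 2, 10, 3, 11>.
  int Mask[] = {0, 8, 1, 9, 2, 10, 3, 11};
  unsigned WhichResult = 0;
  bool IsZip = llvm::isZIPMask(Mask, /*NumElts=*/8, WhichResult);
  assert(IsZip); // presumed to report the zip1 form via WhichResult == 0
}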
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI VectorType * getDeinterleavedVectorType(IntrinsicInst *DI)
Given a deinterleaveN intrinsic, return the (narrow) vector type of each factor.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
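An illustration with arbitrary sizes:
#include "llvm/Support/Alignment.h"
#include <cassert>
static void alignToSketch() {
  // Rounding a 10-byte size up to an 8-byte boundary gives 16 bytes.
  assert(llvm::alignTo(10, llvm::Align(8)) == 16);
  // Sizes that are already aligned come back unchanged.
  assert(llvm::alignTo(32, llvm::Align(8)) == 32);
}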
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
gep_type_iterator gep_type_begin(const User *GEP)
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2100
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:257
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
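A hedged sketch of how the two SVE predicate-pattern helpers above relate. It assumes the declarations are visible (the exact header is not named here) and that element counts with a dedicated VLn pattern round-trip; counts without one are expected to yield std::nullopt.
#include <cassert>
#include <optional>
static void svePatternSketch() {
  // 16 elements has a dedicated VL16 pattern, so the lookup should succeed.
  if (std::optional<unsigned> Pat = llvm::getSVEPredPatternFromNumElements(16))
    assert(llvm::getNumElementsFromSVEPredPattern(*Pat) == 16);
}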
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2088
static const MachineMemOperand::Flags MOStridedAccess
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
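For illustration, with arbitrary arguments:
#include "llvm/Analysis/VectorUtils.h" // declares createSequentialMask (header assumed)
static void sequentialMaskSketch() {
  // {0, 1, 2, 3}
  auto MaskA = llvm::createSequentialMask(/*Start=*/0, /*NumInts=*/4, /*NumUndefs=*/0);
  // {4, 5, 6, 7, -1, -1}: trailing undef lanes are encoded as -1.
  auto MaskB = llvm::createSequentialMask(/*Start=*/4, /*NumInts=*/4, /*NumUndefs=*/2);
  (void)MaskA;
  (void)MaskB;
}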
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N bit number shifted left by S.
Definition MathExtras.h:207
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
Definition WithColor.h:47
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
Helper structure to be able to read SetCC information.
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
uint64_t getScalarStoreSize() const
Definition ValueTypes.h:402
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition ValueTypes.h:444
bool isScalableVT() const
Return true if the type is a scalable type.
Definition ValueTypes.h:187
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:292
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
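A short sketch, not from this file, exercising a few of the EVT queries and conversions listed above on simple NEON-sized types:
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
static void evtSketch(llvm::LLVMContext &Ctx) {
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4); // v4f32
  assert(V4F32.is128BitVector() && V4F32.isFloatingPoint());
  // Same shape with integer lanes: v4i32.
  llvm::EVT V4I32 = V4F32.changeVectorElementTypeToInteger();
  assert(V4I32.isInteger() && V4I32.getScalarSizeInBits() == 32);
  // Halving the lane count gives the 64-bit form: v2f32.
  llvm::EVT V2F32 = V4F32.getHalfNumVectorElementsVT(Ctx);
  assert(V2F32.is64BitVector() && V2F32.getVectorNumElements() == 2);
}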
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
static LLVM_ABI KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:135
static LLVM_ABI KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
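A minimal sketch, not from this file, exercising a few of the KnownBits operations listed above on small constants:
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
static void knownBitsSketch() {
  // Fully known 8-bit constants.
  llvm::KnownBits A = llvm::KnownBits::makeConstant(llvm::APInt(8, 3));
  llvm::KnownBits Amt = llvm::KnownBits::makeConstant(llvm::APInt(8, 2));
  // Shifting a fully known value by a fully known amount stays fully known.
  llvm::KnownBits S = llvm::KnownBits::shl(A, Amt);
  assert(S.isConstant() && S.getConstant() == 12);
  // Intersecting information with itself changes nothing.
  assert(S.intersectWith(S).getConstant() == 12);
  // A value with no bits known at all.
  llvm::KnownBits Unknown(8);
  assert(!Unknown.isZero() && Unknown.countMaxActiveBits() == 8);
}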
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64