1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
20#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
61#include "llvm/IR/Attributes.h"
62#include "llvm/IR/Constants.h"
63#include "llvm/IR/DataLayout.h"
64#include "llvm/IR/DebugLoc.h"
66#include "llvm/IR/Function.h"
68#include "llvm/IR/GlobalValue.h"
69#include "llvm/IR/IRBuilder.h"
70#include "llvm/IR/Instruction.h"
73#include "llvm/IR/Intrinsics.h"
74#include "llvm/IR/IntrinsicsAArch64.h"
75#include "llvm/IR/Module.h"
77#include "llvm/IR/Type.h"
78#include "llvm/IR/Use.h"
79#include "llvm/IR/Value.h"
84#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <bitset>
96#include <cassert>
97#include <cctype>
98#include <cstdint>
99#include <cstdlib>
100#include <iterator>
101#include <limits>
102#include <optional>
103#include <tuple>
104#include <utility>
105#include <vector>
106
107using namespace llvm;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed in
130// the future, once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP nodes use ALU ports, and the data dependency can
143// become the bottleneck after this transform on high-end CPUs. This maximum
144// leaf-node limit guards that the cmp+ccmp transform stays profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148// By turning this on, we will not fall back to DAG ISel when encountering
149// scalable vector types for any instruction, even if SVE is not yet supported
150// for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
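// Per AAPCS64, X0-X7 and Q0-Q7 are the eight integer and eight FP/SIMD
// argument-passing registers; these tables are consulted when assigning (and,
// for varargs, spilling) incoming arguments of each class.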
177
179
181
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
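// Illustrative mapping: every packed SVE type fills a 128-bit granule, so an
// f16 element maps to nxv8f16 (128/16 = 8 lanes per granule) and an i64
// element maps to nxv2i64 (128/64 = 2 lanes per granule).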
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
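// For example, a known-minimum element count of 4 maps to nxv4i32, i.e. the
// packed integer type whose 128-bit granule holds exactly four lanes.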
221
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
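// For example, the predicate type nxv4i1 promotes to nxv4i32: the packed
// integer type with the same lane count, so each predicate bit widens to a
// full data element.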
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
247 return VT.isFixedLengthVector() ||
249}
250
251// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
252// predicate and end with a passthru value matching the result type.
253static bool isMergePassthruOpcode(unsigned Opc) {
254 switch (Opc) {
255 default:
256 return false;
257 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
258 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
259 case AArch64ISD::REVH_MERGE_PASSTHRU:
260 case AArch64ISD::REVW_MERGE_PASSTHRU:
261 case AArch64ISD::REVD_MERGE_PASSTHRU:
262 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
263 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
264 case AArch64ISD::DUP_MERGE_PASSTHRU:
265 case AArch64ISD::ABS_MERGE_PASSTHRU:
266 case AArch64ISD::NEG_MERGE_PASSTHRU:
267 case AArch64ISD::FNEG_MERGE_PASSTHRU:
268 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
269 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
270 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
271 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
272 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
273 case AArch64ISD::FRINT_MERGE_PASSTHRU:
274 case AArch64ISD::FROUND_MERGE_PASSTHRU:
275 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
276 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
277 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
278 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
279 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
280 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
281 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
282 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
283 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
284 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
285 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
286 case AArch64ISD::FABS_MERGE_PASSTHRU:
287 return true;
288 }
289}
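// Operand-layout sketch for these nodes: e.g. FABS_MERGE_PASSTHRU(Pg, Src,
// Passthru) yields fabs(Src) in lanes where the governing predicate Pg is
// active, and Passthru in the inactive lanes.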
290
291// Returns true if inactive lanes are known to be zeroed by construction.
293 switch (Op.getOpcode()) {
294 default:
295 return false;
296 // We guarantee i1 splat_vectors to zero the other lanes
298 case ISD::GET_ACTIVE_LANE_MASK:
299 case AArch64ISD::PTRUE:
300 case AArch64ISD::SETCC_MERGE_ZERO:
301 return true;
303 switch (Op.getConstantOperandVal(0)) {
304 default:
305 return false;
306 case Intrinsic::aarch64_sve_ptrue:
307 case Intrinsic::aarch64_sve_pnext:
308 case Intrinsic::aarch64_sve_cmpeq:
309 case Intrinsic::aarch64_sve_cmpne:
310 case Intrinsic::aarch64_sve_cmpge:
311 case Intrinsic::aarch64_sve_cmpgt:
312 case Intrinsic::aarch64_sve_cmphs:
313 case Intrinsic::aarch64_sve_cmphi:
314 case Intrinsic::aarch64_sve_cmpeq_wide:
315 case Intrinsic::aarch64_sve_cmpne_wide:
316 case Intrinsic::aarch64_sve_cmpge_wide:
317 case Intrinsic::aarch64_sve_cmpgt_wide:
318 case Intrinsic::aarch64_sve_cmplt_wide:
319 case Intrinsic::aarch64_sve_cmple_wide:
320 case Intrinsic::aarch64_sve_cmphs_wide:
321 case Intrinsic::aarch64_sve_cmphi_wide:
322 case Intrinsic::aarch64_sve_cmplo_wide:
323 case Intrinsic::aarch64_sve_cmpls_wide:
324 case Intrinsic::aarch64_sve_fcmpeq:
325 case Intrinsic::aarch64_sve_fcmpne:
326 case Intrinsic::aarch64_sve_fcmpge:
327 case Intrinsic::aarch64_sve_fcmpgt:
328 case Intrinsic::aarch64_sve_fcmpuo:
329 case Intrinsic::aarch64_sve_facgt:
330 case Intrinsic::aarch64_sve_facge:
331 case Intrinsic::aarch64_sve_whilege:
332 case Intrinsic::aarch64_sve_whilegt:
333 case Intrinsic::aarch64_sve_whilehi:
334 case Intrinsic::aarch64_sve_whilehs:
335 case Intrinsic::aarch64_sve_whilele:
336 case Intrinsic::aarch64_sve_whilelo:
337 case Intrinsic::aarch64_sve_whilels:
338 case Intrinsic::aarch64_sve_whilelt:
339 case Intrinsic::aarch64_sve_match:
340 case Intrinsic::aarch64_sve_nmatch:
341 case Intrinsic::aarch64_sve_whilege_x2:
342 case Intrinsic::aarch64_sve_whilegt_x2:
343 case Intrinsic::aarch64_sve_whilehi_x2:
344 case Intrinsic::aarch64_sve_whilehs_x2:
345 case Intrinsic::aarch64_sve_whilele_x2:
346 case Intrinsic::aarch64_sve_whilelo_x2:
347 case Intrinsic::aarch64_sve_whilels_x2:
348 case Intrinsic::aarch64_sve_whilelt_x2:
349 return true;
350 }
351 }
352}
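// For example, AArch64ISD::PTRUE with a fixed pattern activates only the lanes
// selected by that pattern and leaves the remaining predicate lanes false, so
// no extra masking is needed to clear its inactive lanes.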
353
354static std::tuple<SDValue, SDValue>
356 SDLoc DL(Disc);
357 SDValue AddrDisc;
358 SDValue ConstDisc;
359
360 // If this is a blend, remember the constant and address discriminators.
361 // Otherwise, it's either a constant discriminator, or a non-blended
362 // address discriminator.
363 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
364 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
365 AddrDisc = Disc->getOperand(1);
366 ConstDisc = Disc->getOperand(2);
367 } else {
368 ConstDisc = Disc;
369 }
370
371 // If the constant discriminator (either the blend RHS, or the entire
372 // discriminator value) isn't a 16-bit constant, bail out, and let the
373 // discriminator be computed separately.
374 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
375 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
376 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
377
378 // If there's no address discriminator, use NoRegister, which we'll later
379 // replace with XZR, or directly use a Z variant of the inst. when available.
380 if (!AddrDisc)
381 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
382
383 return std::make_tuple(
384 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
385 AddrDisc);
386}
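// Usage sketch with hypothetical values: a discriminator formed by
// @llvm.ptrauth.blend(%addr, 1234) splits into the target constant 1234 plus
// the address part %addr; a bare 16-bit constant splits into (constant,
// NoRegister); anything else falls back to (0, original discriminator).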
387
389 const AArch64Subtarget &STI)
390 : TargetLowering(TM, STI), Subtarget(&STI) {
391 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
392 // we have to make something up. Arbitrarily, choose ZeroOrOne.
394 // When comparing vectors the result sets the different elements in the
395 // vector to all-one or all-zero.
397
398 // Set up the register classes.
399 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
400 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
401
402 if (Subtarget->hasLS64()) {
403 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
404 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
405 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
406 }
407
408 if (Subtarget->hasFPARMv8()) {
409 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
410 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
411 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
412 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
413 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
414 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
415 }
416
417 if (Subtarget->hasNEON()) {
418 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
419 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
420
421 addDRType(MVT::v2f32);
422 addDRType(MVT::v8i8);
423 addDRType(MVT::v4i16);
424 addDRType(MVT::v2i32);
425 addDRType(MVT::v1i64);
426 addDRType(MVT::v1f64);
427 addDRType(MVT::v4f16);
428 addDRType(MVT::v4bf16);
429
430 addQRType(MVT::v4f32);
431 addQRType(MVT::v2f64);
432 addQRType(MVT::v16i8);
433 addQRType(MVT::v8i16);
434 addQRType(MVT::v4i32);
435 addQRType(MVT::v2i64);
436 addQRType(MVT::v8f16);
437 addQRType(MVT::v8bf16);
438 }
439
440 if (Subtarget->isSVEorStreamingSVEAvailable()) {
441 // Add legal sve predicate types
442 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
443 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
444 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
445 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
446 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
447
448 // Add sve predicate as counter type
449 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
450
451 // Add legal sve data types
452 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
453 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
454 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
455 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
456
457 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
458 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
459 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
460 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
461 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
462 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
463
464 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
465 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
466 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
467
468 if (Subtarget->useSVEForFixedLengthVectors()) {
471 addRegisterClass(VT, &AArch64::ZPRRegClass);
472
475 addRegisterClass(VT, &AArch64::ZPRRegClass);
476 }
477 }
478
479 // Compute derived properties from the register classes
480 computeRegisterProperties(Subtarget->getRegisterInfo());
481
482 // Provide all sorts of operation actions
500 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
501 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
502 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
503 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
504 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
505 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
508 if (Subtarget->hasFPARMv8()) {
511 }
520 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
522 setOperationAction(ISD::BRIND, MVT::Other, Custom);
524
526
530
533
535
536 // Custom lowering hooks are needed for XOR
537 // to fold it into CSINC/CSINV.
540
541 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
542 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
543
544 // Virtually no operation on f128 is legal, but LLVM can't expand them when
545 // there's a valid register class, so we need custom operations in most cases.
546 setOperationAction(ISD::FABS, MVT::f128, Expand);
549 setOperationAction(ISD::FCOS, MVT::f128, Expand);
553 setOperationAction(ISD::FNEG, MVT::f128, Expand);
554 setOperationAction(ISD::FPOW, MVT::f128, Expand);
556 setOperationAction(ISD::FRINT, MVT::f128, Expand);
557 setOperationAction(ISD::FSIN, MVT::f128, Expand);
558 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
559 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
561 setOperationAction(ISD::FTAN, MVT::f128, Expand);
562 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
566 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
569 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
570 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
571 // aren't handled.
572
573 // Lowering for many of the conversions is actually specified by the non-f128
574 // type. The LowerXXX function will be trivial when f128 isn't involved.
599 if (Subtarget->hasFPARMv8()) {
602 }
605 if (Subtarget->hasFPARMv8()) {
608 }
611
616
617 // Variable arguments.
618 setOperationAction(ISD::VASTART, MVT::Other, Custom);
619 setOperationAction(ISD::VAARG, MVT::Other, Custom);
620 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
621 setOperationAction(ISD::VAEND, MVT::Other, Expand);
622
623 // Variable-sized objects.
624 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
625 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
626
627 // Lowering Funnel Shifts to EXTR
632
633 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
634
635 // Constant pool entries
637
638 // BlockAddress
640
641 // AArch64 lacks both left-rotate and popcount instructions.
647 }
648
649 // AArch64 doesn't have i32 MULH{S|U}.
652
653 // AArch64 doesn't have {U|S}MUL_LOHI.
658
659 if (Subtarget->hasCSSC()) {
663
665
669
672
677
682 } else {
686
689
692 }
693
699 }
706
707 // Custom lower Add/Sub/Mul with overflow.
720
729
730 setOperationAction(ISD::FSIN, MVT::f32, Expand);
731 setOperationAction(ISD::FSIN, MVT::f64, Expand);
732 setOperationAction(ISD::FCOS, MVT::f32, Expand);
733 setOperationAction(ISD::FCOS, MVT::f64, Expand);
734 setOperationAction(ISD::FPOW, MVT::f32, Expand);
735 setOperationAction(ISD::FPOW, MVT::f64, Expand);
738 if (Subtarget->hasFullFP16()) {
741 } else {
744 }
745
746 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
747 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
748 ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS,
749 ISD::FASIN, ISD::FATAN, ISD::FATAN2,
750 ISD::FCOSH, ISD::FSINH, ISD::FTANH,
751 ISD::FTAN, ISD::FEXP, ISD::FEXP2,
752 ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
760 setOperationAction(Op, MVT::f16, Promote);
761 setOperationAction(Op, MVT::v4f16, Expand);
762 setOperationAction(Op, MVT::v8f16, Expand);
763 setOperationAction(Op, MVT::bf16, Promote);
764 setOperationAction(Op, MVT::v4bf16, Expand);
765 setOperationAction(Op, MVT::v8bf16, Expand);
766 }
767
768 // Legalize fcanonicalize to circumvent default expansion
769 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
770 if (Subtarget->hasFullFP16()) {
772 }
773
774 // fpextend from f16 or bf16 to f32 is legal
775 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
776 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
779 // fpextend from bf16 to f64 needs to be split into two fpextends
780 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
782
783 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
784 for (auto Op : {
787 ISD::BR_CC,
788 ISD::FADD,
789 ISD::FSUB,
790 ISD::FMUL,
791 ISD::FDIV,
792 ISD::FMA,
793 ISD::FCEIL,
794 ISD::FSQRT,
795 ISD::FFLOOR,
796 ISD::FNEARBYINT,
797 ISD::FRINT,
798 ISD::FROUND,
799 ISD::FROUNDEVEN,
800 ISD::FTRUNC,
801 ISD::FMINNUM,
802 ISD::FMAXNUM,
803 ISD::FMINIMUM,
804 ISD::FMAXIMUM,
805 ISD::FMINIMUMNUM,
806 ISD::FMAXIMUMNUM,
825 })
826 setOperationAction(Op, ScalarVT, Promote);
827
828 for (auto Op : {ISD::FNEG, ISD::FABS})
829 setOperationAction(Op, ScalarVT, Legal);
830
831 // Round-to-integer operations need custom lowering for fp16, as Promote
832 // doesn't work because the result type is integer.
833 for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
836 setOperationAction(Op, ScalarVT, Custom);
837
838 // promote v4f16 to v4f32 when that is known to be safe.
839 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
840 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
841 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
842 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
843 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
844 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
845 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
846 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
847 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
848 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
849 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
850 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
851 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
852 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
853
854 setOperationAction(ISD::FABS, V4Narrow, Legal);
855 setOperationAction(ISD::FNEG, V4Narrow, Legal);
857 setOperationAction(ISD::BR_CC, V4Narrow, Expand);
861 setOperationAction(ISD::FSQRT, V4Narrow, Expand);
862
863 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
864 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
865 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
866
867 setOperationAction(ISD::FABS, V8Narrow, Legal);
869 setOperationAction(ISD::FCEIL, V8Narrow, Legal);
872 setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
875 setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
876 setOperationAction(ISD::FNEG, V8Narrow, Legal);
877 setOperationAction(ISD::FROUND, V8Narrow, Legal);
878 setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
879 setOperationAction(ISD::FRINT, V8Narrow, Legal);
880 setOperationAction(ISD::FSQRT, V8Narrow, Expand);
882 setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
883 setOperationAction(ISD::BR_CC, V8Narrow, Expand);
886 setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
887 };
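  // LegalizeNarrowFP is applied just below: to f16 only when FullFP16 is
  // unavailable, and unconditionally to bf16.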
888
889 if (!Subtarget->hasFullFP16()) {
890 LegalizeNarrowFP(MVT::f16);
891 }
892 LegalizeNarrowFP(MVT::bf16);
895
896 // AArch64 has implementations of a lot of rounding-like FP operations.
897 // clang-format off
898 for (auto Op :
899 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
900 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
901 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
902 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
903 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
904 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
910 for (MVT Ty : {MVT::f32, MVT::f64})
912 if (Subtarget->hasFullFP16())
913 setOperationAction(Op, MVT::f16, Legal);
914 }
915 // clang-format on
916
917 // Basic strict FP operations are legal
920 for (MVT Ty : {MVT::f32, MVT::f64})
922 if (Subtarget->hasFullFP16())
923 setOperationAction(Op, MVT::f16, Legal);
924 }
925
926 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
927
929 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
930 setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
931 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
932 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
933
934 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
935 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
936 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
937 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
938 } else {
939 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
940 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
941 }
942 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
943 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
944
945 // Generate outline atomics library calls only if LSE was not specified for
946 // subtarget
947 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
948 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
949 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
950 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
951 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
952 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
953 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
954 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
955 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
956 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
957 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
958 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
959 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
960 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
961 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
962 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
963 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
964 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
965 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
966 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
967 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
968 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
969 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
970 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
971 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
972 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
973 }
974
975 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
976 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f16, LibCall);
977 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f32, LibCall);
978 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f64, LibCall);
979 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::bf16, LibCall);
980
981 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f16, LibCall);
982 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f32, LibCall);
983 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f64, LibCall);
984 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::bf16, LibCall);
985
986 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f16, LibCall);
987 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f32, LibCall);
988 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f64, LibCall);
989 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::bf16, LibCall);
990
991 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f16, LibCall);
992 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f32, LibCall);
993 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f64, LibCall);
994 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::bf16, LibCall);
995
996 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f16, LibCall);
997 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f32, LibCall);
998 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f64, LibCall);
999 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::bf16, LibCall);
1000 }
1001
1002 if (Subtarget->hasLSE128()) {
1003 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1004 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1005 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
1006 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
1007 setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
1008 }
1009
1010 // 128-bit loads and stores can be done without expanding
1011 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1012 setOperationAction(ISD::STORE, MVT::i128, Custom);
1013
1014 // Aligned 128-bit loads and stores are single-copy atomic according to the
1015 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1016 if (Subtarget->hasLSE2()) {
1017 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
1018 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
1019 }
1020
1021 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1022 // custom lowering, as there are no un-paired non-temporal stores and
1023 // legalization will break up 256 bit inputs.
1024 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1025 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1026 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1027 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1028 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1029 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1030 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1031 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1032
1033 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1034 // custom lowering, as there are no un-paired non-temporal loads and
1035 // legalization will break up 256 bit inputs.
1036 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1037 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1038 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1039 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1040 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1041 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1042 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1043 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1044
1045 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1046 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
1047
1048 // Issue __sincos_stret if available.
1049 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1050 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1051
1052 // Make floating-point constants legal for the large code model, so they don't
1053 // become loads from the constant pool.
1054 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1057 }
1058
1059 // AArch64 does not have floating-point extending loads, i1 sign-extending
1060 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1061 for (MVT VT : MVT::fp_valuetypes()) {
1062 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1063 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1064 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1065 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1066 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1067 }
1068 for (MVT VT : MVT::integer_valuetypes())
1069 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1070
1071 for (MVT WideVT : MVT::fp_valuetypes()) {
1072 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1073 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1074 setTruncStoreAction(WideVT, NarrowVT, Expand);
1075 }
1076 }
1077 }
1078
1079 if (Subtarget->hasFPARMv8()) {
1080 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1081 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
1082 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
1083 }
1084
1085 // Indexed loads and stores are supported.
1086 for (unsigned im = (unsigned)ISD::PRE_INC;
1088 setIndexedLoadAction(im, MVT::i8, Legal);
1089 setIndexedLoadAction(im, MVT::i16, Legal);
1090 setIndexedLoadAction(im, MVT::i32, Legal);
1091 setIndexedLoadAction(im, MVT::i64, Legal);
1092 setIndexedLoadAction(im, MVT::f64, Legal);
1093 setIndexedLoadAction(im, MVT::f32, Legal);
1094 setIndexedLoadAction(im, MVT::f16, Legal);
1095 setIndexedLoadAction(im, MVT::bf16, Legal);
1096 setIndexedStoreAction(im, MVT::i8, Legal);
1097 setIndexedStoreAction(im, MVT::i16, Legal);
1098 setIndexedStoreAction(im, MVT::i32, Legal);
1099 setIndexedStoreAction(im, MVT::i64, Legal);
1100 setIndexedStoreAction(im, MVT::f64, Legal);
1101 setIndexedStoreAction(im, MVT::f32, Legal);
1102 setIndexedStoreAction(im, MVT::f16, Legal);
1103 setIndexedStoreAction(im, MVT::bf16, Legal);
1104 }
1105
1106 // Trap.
1107 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1108 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1109 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
1110
1111 // We combine OR nodes for ccmp operations.
1113 // Try to create BICs for vector ANDs.
1115
1116 // llvm.init.trampoline and llvm.adjust.trampoline
1117 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
1118 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
1119
1120 // Vector add and sub nodes may conceal a high-half opportunity.
1122 // Also, try to fold ADD into CSINC/CSINV.
1124
1127
1128 // Try and combine setcc with csel
1130
1132
1136 ISD::STORE, ISD::BUILD_VECTOR});
1139 setTargetDAGCombine(ISD::LOAD);
1140
1141 setTargetDAGCombine(ISD::MSTORE);
1142
1144
1146
1149 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1150
1152 {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
1153
1154 setTargetDAGCombine(ISD::FP_EXTEND);
1155
1157
1159
1160 setTargetDAGCombine(ISD::GET_ACTIVE_LANE_MASK);
1161
1162 setTargetDAGCombine(ISD::VECREDUCE_AND);
1163 setTargetDAGCombine(ISD::VECREDUCE_OR);
1164 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1165
1167
1171
1172 // In case of strict alignment, avoid an excessive number of byte wide stores.
1175 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1176
1180 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1181
1184 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1185
1188 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1189
1191
1193
1194 EnableExtLdPromotion = true;
1195
1196 // Set required alignment.
1198 // Set preferred alignments.
1199
1200 // Don't align loops on Windows. The SEH unwind info generation needs to
1201 // know the exact length of functions before the alignments have been
1202 // expanded.
1203 if (!Subtarget->isTargetWindows())
1207
1208 // Only change the limit for entries in a jump table if specified by
1209 // the subtarget, but not at the command line.
1210 unsigned MaxJT = STI.getMaximumJumpTableSize();
1211 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1213
1215
1217
1219 if (Subtarget->hasSME())
1221
1222 if (Subtarget->isNeonAvailable()) {
1223 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1224 // silliness like this:
1225 // clang-format off
1226 for (auto Op :
1227 {ISD::SELECT, ISD::SELECT_CC, ISD::FATAN2,
1228 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1230 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1231 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1232 ISD::FSIN, ISD::FCOS, ISD::FTAN,
1233 ISD::FASIN, ISD::FACOS, ISD::FATAN,
1234 ISD::FSINH, ISD::FCOSH, ISD::FTANH,
1235 ISD::FPOW, ISD::FLOG, ISD::FLOG2,
1236 ISD::FLOG10, ISD::FEXP, ISD::FEXP2,
1237 ISD::FEXP10, ISD::FRINT, ISD::FROUND,
1238 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM,
1239 ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM,
1240 ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1247 setOperationAction(Op, MVT::v1f64, Expand);
1248 // clang-format on
1249
1250 for (auto Op :
1255 setOperationAction(Op, MVT::v1i64, Expand);
1256
1257 // AArch64 doesn't have direct vector->f32 conversion instructions for
1258 // elements smaller than i32, so promote the input to i32 first.
1259 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1260 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1261
1262 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1263 // Nor is there a direct i32 -> f16 vector conversion. Set these to Custom so
1264 // the conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1267 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1269
1270 if (Subtarget->hasFullFP16()) {
1273
1282 } else {
1283 // When AArch64 doesn't have full fp16 support, promote the input
1284 // to i32 first.
1285 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1286 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1287 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1288 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1289 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1290 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1291 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1292 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1293 }
1294
1295 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1296 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1303 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1308 }
1309
1310 // Custom handling for some quad-vector types to detect MULL.
1311 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1312 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1313 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1314 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1315 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1316 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
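  // For example, a v2i64 multiply whose operands are sign- or zero-extended
  // from v2i32 can be selected as SMULL/UMULL instead of being scalarised.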
1317
1318 // Saturates
1319 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1320 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1325 }
1326
1327 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1328 MVT::v4i32}) {
1335 }
1336
1337 // Vector reductions
1338 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1339 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1340 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1341 setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
1342 setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
1343 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal);
1344 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal);
1345
1346 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1347 }
1348 }
1349 if (Subtarget->hasFullFP16())
1350 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
1351
1352 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1353 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1354 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1355 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1356 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1357 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1358 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1359 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1360 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1361 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1362 }
1363 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1364 setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
1365 setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
1366 setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
1367
1369 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1370 // Likewise, narrowing and extending vector loads/stores aren't handled
1371 // directly.
1374
1375 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1378 } else {
1381 }
1384
1387
1388 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1389 setTruncStoreAction(VT, InnerVT, Expand);
1390 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1391 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1392 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1393 }
1394 }
1395
1396 for (auto Op :
1397 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1398 ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1402 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1404 if (Subtarget->hasFullFP16())
1405 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1407 }
1408
1409 // LRINT and LLRINT.
1410 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1411 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1413 if (Subtarget->hasFullFP16())
1414 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1416 }
1417
1418 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1419
1420 setOperationAction(ISD::BITCAST, MVT::i2, Custom);
1421 setOperationAction(ISD::BITCAST, MVT::i4, Custom);
1422 setOperationAction(ISD::BITCAST, MVT::i8, Custom);
1423 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1424
1425 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
1426 setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
1427 setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
1428
1429 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1430 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1431 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1432 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1433 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1434 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1435 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1436 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1437 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1438 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1439 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1440 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1441 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1442 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1443 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1444 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1445 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1446 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1447
1448 // ADDP custom lowering
1449 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1451 // FADDP custom lowering
1452 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1454
1455 if (Subtarget->hasDotProd()) {
1456 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1457 ISD::PARTIAL_REDUCE_UMLA};
1458
1459 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1460 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1461 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom);
1462 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1463
1464 if (Subtarget->hasMatMulInt8()) {
1465 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v4i32,
1466 MVT::v16i8, Legal);
1467 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i64,
1468 MVT::v16i8, Custom);
1469
1470 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i32,
1471 MVT::v8i8, Legal);
1472 }
1473 }
1474
1475 } else /* !isNeonAvailable */ {
1477 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1479
1480 if (VT.is128BitVector() || VT.is64BitVector()) {
1481 setOperationAction(ISD::LOAD, VT, Legal);
1482 setOperationAction(ISD::STORE, VT, Legal);
1483 setOperationAction(ISD::BITCAST, VT,
1484 Subtarget->isLittleEndian() ? Legal : Expand);
1485 }
1486 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1487 setTruncStoreAction(VT, InnerVT, Expand);
1488 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1489 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1490 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1491 }
1492 }
1493 }
1494
1495 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1499 }
1500
1501 if (Subtarget->hasSME()) {
1503 }
1504
1505 // FIXME: Move lowering for more nodes here if those are common between
1506 // SVE and SME.
1507 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1508 for (auto VT :
1509 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1514 }
1515 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1516 setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Legal);
1517 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Legal);
1518 }
1519
1520 if (Subtarget->hasSVE2p1() ||
1521 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1522 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, MVT::nxv32i1, Custom);
1523
1524 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1525 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Custom);
1526
1527 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v2f64})
1529 }
1530
1531 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1532 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1543 setOperationAction(ISD::MLOAD, VT, Custom);
1544 setOperationAction(ISD::MSTORE, VT, Legal);
1564 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1565 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1566 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1567 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
1568 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1569 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1570 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1571 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1572 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1575
1581
1590
1595
1599
1600 if (!Subtarget->isLittleEndian())
1601 setOperationAction(ISD::BITCAST, VT, Custom);
1602
1603 if (Subtarget->hasSVE2() ||
1604 (Subtarget->hasSME() && Subtarget->isStreaming()))
1605 // For SLI/SRI.
1607 }
1608
1609 // Illegal unpacked integer vector types.
1610 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1613 }
1614
1615 // Type legalize unpacked bitcasts.
1616 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1617 setOperationAction(ISD::BITCAST, VT, Custom);
1618
1619 for (auto VT :
1620 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1621 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1623
1624 // Promote predicate as counter load/stores to standard predicates.
1625 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
1626 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
1627
1628 // Predicate as counter legalization actions.
1629 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
1630 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
1631
1632 for (auto VT :
1633 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1638 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1639 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1640 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1641
1645
1646 // There are no legal MVT::nxv16f## based types.
1647 if (VT != MVT::nxv16i1) {
1652 }
1653 }
1654
1655 // NEON doesn't support masked loads/stores, but SME and SVE do.
1656 for (auto VT :
1657 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1658 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1659 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1660 setOperationAction(ISD::MLOAD, VT, Custom);
1661 setOperationAction(ISD::MSTORE, VT, Custom);
1662 }
1663
1664 // Firstly, exclude all scalable vector extending loads and truncating
1665 // stores, covering both integer and floating-point scalable vectors.
1667 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1668 setTruncStoreAction(VT, InnerVT, Expand);
1669 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1670 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1671 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1672 }
1673 }
1674
1675 // Then, selectively enable those which we directly support.
1676 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1677 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1678 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1679 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1680 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1681 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1682 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1683 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1684 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1685 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1686 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1687 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1688 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1689 }
1690
1691 // SVE supports truncating stores of 64 and 128-bit vectors
1692 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1693 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1694 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1695 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1696 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1697
1698 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1699 MVT::nxv4f32, MVT::nxv2f64}) {
1700 setOperationAction(ISD::BITCAST, VT, Custom);
1703 setOperationAction(ISD::MLOAD, VT, Custom);
1711 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1712 setOperationAction(ISD::FMAXNUM, VT, Custom);
1713 setOperationAction(ISD::FMINIMUM, VT, Custom);
1714 setOperationAction(ISD::FMINNUM, VT, Custom);
1716 setOperationAction(ISD::FNEG, VT, Custom);
1718 setOperationAction(ISD::FCEIL, VT, Custom);
1719 setOperationAction(ISD::FFLOOR, VT, Custom);
1720 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1721 setOperationAction(ISD::FRINT, VT, Custom);
1722 setOperationAction(ISD::LRINT, VT, Custom);
1723 setOperationAction(ISD::LLRINT, VT, Custom);
1724 setOperationAction(ISD::FROUND, VT, Custom);
1725 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1726 setOperationAction(ISD::FTRUNC, VT, Custom);
1727 setOperationAction(ISD::FSQRT, VT, Custom);
1728 setOperationAction(ISD::FABS, VT, Custom);
1729 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1731 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1732 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1733 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1734 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1735 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1736 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
1740
1743 setOperationAction(ISD::FPOW, VT, Expand);
1744 setOperationAction(ISD::FPOWI, VT, Expand);
1745 setOperationAction(ISD::FCOS, VT, Expand);
1746 setOperationAction(ISD::FSIN, VT, Expand);
1747 setOperationAction(ISD::FSINCOS, VT, Expand);
1748 setOperationAction(ISD::FTAN, VT, Expand);
1749 setOperationAction(ISD::FACOS, VT, Expand);
1750 setOperationAction(ISD::FASIN, VT, Expand);
1751 setOperationAction(ISD::FATAN, VT, Expand);
1752 setOperationAction(ISD::FATAN2, VT, Expand);
1753 setOperationAction(ISD::FCOSH, VT, Expand);
1754 setOperationAction(ISD::FSINH, VT, Expand);
1755 setOperationAction(ISD::FTANH, VT, Expand);
1756 setOperationAction(ISD::FEXP, VT, Expand);
1757 setOperationAction(ISD::FEXP2, VT, Expand);
1758 setOperationAction(ISD::FEXP10, VT, Expand);
1759 setOperationAction(ISD::FLOG, VT, Expand);
1760 setOperationAction(ISD::FLOG2, VT, Expand);
1761 setOperationAction(ISD::FLOG10, VT, Expand);
1762
1774 }
1775
1776 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1777 setOperationAction(ISD::BITCAST, VT, Custom);
1779 setOperationAction(ISD::FABS, VT, Custom);
1781 setOperationAction(ISD::FNEG, VT, Custom);
1782 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1784 setOperationAction(ISD::MLOAD, VT, Custom);
1792 }
1793
1794 if (Subtarget->hasSVEB16B16() &&
1795 Subtarget->isNonStreamingSVEorSME2Available()) {
1796 // Note: Use SVE for bfloat16 operations when +sve-b16b16 is available.
1797 for (auto VT : {MVT::v4bf16, MVT::v8bf16, MVT::nxv2bf16, MVT::nxv4bf16,
1798 MVT::nxv8bf16}) {
1801 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1802 setOperationAction(ISD::FMAXNUM, VT, Custom);
1803 setOperationAction(ISD::FMINIMUM, VT, Custom);
1804 setOperationAction(ISD::FMINNUM, VT, Custom);
1807 }
1808 }
1809
1810 for (auto Opcode :
1811 {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
1812 ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC, ISD::SETCC,
1813 ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMAXIMUM,
1814 ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMINIMUM}) {
1815 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1816 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1817 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1818 }
1819
1820 if (!Subtarget->hasSVEB16B16() ||
1821 !Subtarget->isNonStreamingSVEorSME2Available()) {
1822 for (MVT VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1823 MVT PromotedVT = VT.changeVectorElementType(MVT::f32);
1824 setOperationPromotedToType(ISD::FADD, VT, PromotedVT);
1825 setOperationPromotedToType(ISD::FMA, VT, PromotedVT);
1826 setOperationPromotedToType(ISD::FMAXIMUM, VT, PromotedVT);
1827 setOperationPromotedToType(ISD::FMAXNUM, VT, PromotedVT);
1828 setOperationPromotedToType(ISD::FMINIMUM, VT, PromotedVT);
1829 setOperationPromotedToType(ISD::FMINNUM, VT, PromotedVT);
1830 setOperationPromotedToType(ISD::FSUB, VT, PromotedVT);
1831
1832 if (VT != MVT::nxv2bf16 && Subtarget->hasBF16())
1834 else
1835 setOperationPromotedToType(ISD::FMUL, VT, PromotedVT);
1836 }
1837
1838 if (Subtarget->hasBF16() && Subtarget->isNeonAvailable())
1839 setOperationAction(ISD::FMUL, MVT::v8bf16, Custom);
1840 }
1841
1844
1845 // A number of operations like MULH and integer divides are not supported by
1846 // NEON but are available in SVE.
1847 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1848 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1853 }
1854
1855 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1856 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1857 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1858
1859 // NOTE: Currently this has to happen after computeRegisterProperties rather
1860 // than the preferred option of combining it with the addRegisterClass call.
1861 if (Subtarget->useSVEForFixedLengthVectors()) {
1864 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1865 addTypeForFixedLengthSVE(VT);
1866 }
1869 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1870 addTypeForFixedLengthSVE(VT);
1871 }
1872
1873 // 64-bit results can mean the input is bigger than a NEON register.
1874 for (auto VT : {MVT::v8i8, MVT::v4i16})
1877
1878 // 128-bit results imply the input is bigger than a NEON register.
1879 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1881 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v8bf16})
1883
1884 // These operations are not supported on NEON but SVE can do them.
1886 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1887 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1888 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1889 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1890 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1891 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1892 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1893 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1894 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1895 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1896 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1897 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1898 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1899 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1900 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1901
1902 // Int operations with no NEON support.
1903 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1904 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1907 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1908 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1909 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1910 }
1911
1912 // Use SVE for vectors with more than 2 elements.
1913 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1914 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1915 }
1916
1917 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1918 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1919 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1920 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1921
1922 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1923
1924 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1926 }
1927
1928 // Handle partial reduction operations
1929 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1930 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1931 // Other pairs will default to 'Expand'.
1932 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1933 ISD::PARTIAL_REDUCE_UMLA};
1934 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1935 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1936
1937 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1938
1939 if (Subtarget->hasMatMulInt8()) {
1940 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv4i32,
1941 MVT::nxv16i8, Legal);
1942 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv2i64,
1943 MVT::nxv16i8, Custom);
1944 }
1945
1946 // Wide add types
1947 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1948 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1949 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1950 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
1951 }
1952
1953 // Handle floating-point partial reduction
1954 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
1955 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32,
1956 MVT::nxv8f16, Legal);
1957 // We can use SVE2p1 fdot to emulate the fixed-length variant.
1958 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::v4f32,
1959 MVT::v8f16, Custom);
1960 }
1961 }
1962
1963 // Handle non-aliasing elements mask
1964 if (Subtarget->hasSVE2() ||
1965 (Subtarget->hasSME() && Subtarget->isStreaming())) {
1966 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
1967 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
1970 }
1971 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
1974 }
1975 }
1976
1977 // Handle operations that are only available in non-streaming SVE mode.
1978 if (Subtarget->isSVEAvailable()) {
1979 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1980 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1981 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1982 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1983 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1984 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1985 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1986 setOperationAction(ISD::MGATHER, VT, Custom);
1987 setOperationAction(ISD::MSCATTER, VT, Custom);
1988 }
1989
1990 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1991 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1992 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1993 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1994
1995 // We can lower types that have <vscale x {2|4}> elements to compact.
1996 for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
1997 MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
1998 MVT::nxv4i32, MVT::nxv4f32}) {
2000 // Use a custom lowering for masked stores that could be a supported
2001 // compressing store. Note: These types still use the normal (Legal)
2002 // lowering for non-compressing masked stores.
2003 setOperationAction(ISD::MSTORE, VT, Custom);
2004 }
2005
2006 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
2007 // NEON vectors in the lowest bits of the SVE register.
2008 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
2009 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
2011
2012 // Histcnt is SVE2 only
2013 if (Subtarget->hasSVE2()) {
2014 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv4i32,
2015 Custom);
2016 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
2017 Custom);
2018
2019 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2020 ISD::PARTIAL_REDUCE_UMLA};
2021 // Must be lowered to SVE instructions.
2022 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
2023 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
2024 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
2025 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
2026 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
2027 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
2028 }
2029 }
2030
2031 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
2032 // Only required for llvm.aarch64.mops.memset.tag
2034 }
2035
2037
2038 if (Subtarget->hasSVE()) {
2039 setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
2040 setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
2041 setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
2042 setOperationAction(ISD::FLDEXP, MVT::bf16, Custom);
2043 }
2044
2045 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2046
2047 IsStrictFPEnabled = true;
2049
2050 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2051 // it, but it's just a wrapper around ldexp.
2052 if (Subtarget->isTargetWindows()) {
2053 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2054 if (isOperationExpand(Op, MVT::f32))
2055 setOperationAction(Op, MVT::f32, Promote);
2056 }
2057
2058 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
2059 // isn't legal.
2060 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2061 if (isOperationExpand(Op, MVT::f16))
2062 setOperationAction(Op, MVT::f16, Promote);
2063}
2064
2066 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2067}
2068
2069void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2070 assert(VT.isVector() && "VT should be a vector type");
2071
2072 if (VT.isFloatingPoint()) {
2074 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2075 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2076 }
2077
2078 // Mark vector float intrinsics as expand.
2079 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2080 setOperationAction(ISD::FSIN, VT, Expand);
2081 setOperationAction(ISD::FCOS, VT, Expand);
2082 setOperationAction(ISD::FTAN, VT, Expand);
2083 setOperationAction(ISD::FASIN, VT, Expand);
2084 setOperationAction(ISD::FACOS, VT, Expand);
2085 setOperationAction(ISD::FATAN, VT, Expand);
2086 setOperationAction(ISD::FATAN2, VT, Expand);
2087 setOperationAction(ISD::FSINH, VT, Expand);
2088 setOperationAction(ISD::FCOSH, VT, Expand);
2089 setOperationAction(ISD::FTANH, VT, Expand);
2090 setOperationAction(ISD::FPOW, VT, Expand);
2091 setOperationAction(ISD::FLOG, VT, Expand);
2092 setOperationAction(ISD::FLOG2, VT, Expand);
2093 setOperationAction(ISD::FLOG10, VT, Expand);
2094 setOperationAction(ISD::FEXP, VT, Expand);
2095 setOperationAction(ISD::FEXP2, VT, Expand);
2096 setOperationAction(ISD::FEXP10, VT, Expand);
2097 }
2098
2099 // But we do support custom-lowering for FCOPYSIGN.
2100 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2101 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2102 VT == MVT::v8f16) &&
2103 Subtarget->hasFullFP16()))
2105
2118
2122 for (MVT InnerVT : MVT::all_valuetypes())
2123 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2124
2125 // CNT supports only B element sizes; wider element types then use UADDLP to widen.
2126 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2128
2134
2135 for (unsigned Opcode :
2138 setOperationAction(Opcode, VT, Custom);
2139
2140 if (!VT.isFloatingPoint())
2142
2143 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2144 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2145 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2146 setOperationAction(Opcode, VT, Legal);
2147
2148 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2149 // NEON types.
2150 if (VT.isFloatingPoint() &&
2151 VT.getVectorElementType() != MVT::bf16 &&
2152 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2153 for (unsigned Opcode :
2154 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
2155 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::STRICT_FMINIMUM,
2159 setOperationAction(Opcode, VT, Legal);
2160
2161 // Strict fp extend and trunc are legal
2162 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2164 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2166
2167 // FIXME: We could potentially make use of the vector comparison instructions
2168 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2169 // complications:
2170 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2171 // so we would need to expand when the condition code doesn't match the
2172 // kind of comparison.
2173 // * Some kinds of comparison require more than one FCMXY instruction so
2174 // would need to be expanded instead.
2175 // * The lowering of the non-strict versions involves target-specific ISD
2176 // nodes so we would likely need to add strict versions of all of them and
2177 // handle them appropriately.
2180
2181 // When little-endian we can use ordinary d and q register loads/stores for
2182 // vector types, but when big-endian we need to use structure loads/stores,
2183 // which only allow post-index addressing.
2184 if (Subtarget->isLittleEndian()) {
2185 for (unsigned im = (unsigned)ISD::PRE_INC;
2186 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2189 }
2190 } else {
2193 }
2194
2195 if (Subtarget->hasD128()) {
2198 }
2199
2200 if (VT.isInteger()) {
2201 // Let common code emit inverted variants of compares we do support.
2207 }
2208}
2209
2211 EVT OpVT) const {
2212 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2213 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2214 ResVT.getVectorElementType() != MVT::i1)
2215 return true;
2216
2217 // Only support illegal types if the result is scalable and min elements > 1.
2218 if (ResVT.getVectorMinNumElements() == 1 ||
2219 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2220 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2221 return true;
2222
2223 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2224 // but anything larger should be expanded.
2225 if (OpVT.getFixedSizeInBits() > 64)
2226 return true;
2227
2228 return false;
2229}
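// For example, per the checks above (assuming SVE or streaming SVE is
// available): ResVT == nxv16i1 with OpVT == i64 falls through to return false
// (use whilelo), whereas a fixed-length ResVT wider than 16 lanes, or an OpVT
// wider than 64 bits, returns true.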
2230
2232 if (!Subtarget->isSVEorStreamingSVEAvailable())
2233 return true;
2234
2235 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2236 // also support fixed-width predicates.
2237 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2238 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2239 VT != MVT::v4i1 && VT != MVT::v2i1;
2240}
2241
2243 unsigned SearchSize) const {
2244 // MATCH is SVE2 and only available in non-streaming mode.
2245 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2246 return true;
2247 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2248 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2249 return SearchSize != 8;
2250 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2251 return SearchSize != 8 && SearchSize != 16;
2252 return true;
2253}
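// For example, per the checks above: an nxv16i8/v16i8/v8i8 search with a
// segment size of 8 or 16, or an nxv8i16/v8i16 search with a segment size of
// 8, returns false (use MATCH); everything else returns true.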
2254
2255void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2256 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2257
2258 // By default everything must be expanded.
2259 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2261
2262 if (VT.isFloatingPoint()) {
2272 }
2273
2275 VT == MVT::v1f64 ? Expand : Custom;
2276
2277 // Mark integer truncating stores/extending loads as having custom lowering
2278 if (VT.isInteger()) {
2279 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2280 while (InnerVT != VT) {
2281 setTruncStoreAction(VT, InnerVT, Default);
2282 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2283 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2284 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2285 InnerVT = InnerVT.changeVectorElementType(
2286 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2287 }
2288 }
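// For example, for VT == v8i32 the loop above registers v8i8 and v8i16
// truncating-store / extending-load combinations with the Default action.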
2289
2290 // Mark floating-point truncating stores/extending loads as having custom
2291 // lowering
2292 if (VT.isFloatingPoint()) {
2293 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2294 while (InnerVT != VT) {
2295 setTruncStoreAction(VT, InnerVT, Custom);
2296 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2297 InnerVT = InnerVT.changeVectorElementType(
2299 }
2300 }
2301
2302 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2303 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2304
2305 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2306 ISD::PARTIAL_REDUCE_UMLA};
2307 unsigned NumElts = VT.getVectorNumElements();
2308 if (VT.getVectorElementType() == MVT::i64) {
2309 setPartialReduceMLAAction(MLAOps, VT,
2310 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2311 setPartialReduceMLAAction(MLAOps, VT,
2312 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2313 setPartialReduceMLAAction(MLAOps, VT,
2314 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2315 } else if (VT.getVectorElementType() == MVT::i32) {
2316 setPartialReduceMLAAction(MLAOps, VT,
2317 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2318 setPartialReduceMLAAction(MLAOps, VT,
2319 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2320 } else if (VT.getVectorElementType() == MVT::i16) {
2321 setPartialReduceMLAAction(MLAOps, VT,
2322 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2323 }
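// For example, for a fixed-length VT of v4i64 the cases above mark v32i8,
// v16i16 and v8i32 inputs as Custom partial-reduce sources.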
2324 if (Subtarget->hasMatMulInt8()) {
2325 if (VT.getVectorElementType() == MVT::i32)
2326 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2327 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2328 else if (VT.getVectorElementType() == MVT::i64)
2329 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2330 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2331 }
2332
2333 if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) {
2334 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT,
2335 MVT::getVectorVT(MVT::f16, NumElts * 2), Custom);
2336 }
2337
2338 // Lower fixed length vector operations to scalable equivalents.
2345 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2355 setOperationAction(ISD::FABS, VT, Default);
2357 setOperationAction(ISD::FCEIL, VT, Default);
2360 setOperationAction(ISD::FFLOOR, VT, Default);
2362 setOperationAction(ISD::FMAXIMUM, VT, Default);
2363 setOperationAction(ISD::FMAXNUM, VT, Default);
2364 setOperationAction(ISD::FMINIMUM, VT, Default);
2365 setOperationAction(ISD::FMINNUM, VT, Default);
2367 setOperationAction(ISD::FNEARBYINT, VT, Default);
2368 setOperationAction(ISD::FNEG, VT, Default);
2369 setOperationAction(ISD::FP_EXTEND, VT, Default);
2373 setOperationAction(ISD::FRINT, VT, Default);
2374 setOperationAction(ISD::LRINT, VT, Default);
2375 setOperationAction(ISD::LLRINT, VT, Default);
2376 setOperationAction(ISD::FROUND, VT, Default);
2377 setOperationAction(ISD::FROUNDEVEN, VT, Default);
2378 setOperationAction(ISD::FSQRT, VT, Default);
2380 setOperationAction(ISD::FTRUNC, VT, Default);
2381 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Default);
2383 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2384 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2385 setOperationAction(ISD::MLOAD, VT, Default);
2386 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2387 setOperationAction(ISD::MSTORE, VT, Default);
2405 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2412 setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
2413 setOperationAction(ISD::VECREDUCE_AND, VT, Default);
2414 setOperationAction(ISD::VECREDUCE_FADD, VT, Default);
2415 setOperationAction(ISD::VECREDUCE_FMAX, VT, Default);
2416 setOperationAction(ISD::VECREDUCE_FMIN, VT, Default);
2417 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Default);
2418 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Default);
2419 setOperationAction(ISD::VECREDUCE_OR, VT, Default);
2420 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, PreferSVE ? Default : Expand);
2421 setOperationAction(ISD::VECREDUCE_SMAX, VT, Default);
2422 setOperationAction(ISD::VECREDUCE_SMIN, VT, Default);
2423 setOperationAction(ISD::VECREDUCE_UMAX, VT, Default);
2424 setOperationAction(ISD::VECREDUCE_UMIN, VT, Default);
2425 setOperationAction(ISD::VECREDUCE_XOR, VT, Default);
2431}
2432
2433void AArch64TargetLowering::addDRType(MVT VT) {
2434 addRegisterClass(VT, &AArch64::FPR64RegClass);
2435 if (Subtarget->isNeonAvailable())
2436 addTypeForNEON(VT);
2437}
2438
2439void AArch64TargetLowering::addQRType(MVT VT) {
2440 addRegisterClass(VT, &AArch64::FPR128RegClass);
2441 if (Subtarget->isNeonAvailable())
2442 addTypeForNEON(VT);
2443}
2444
2446 LLVMContext &C, EVT VT) const {
2447 if (!VT.isVector())
2448 return MVT::i32;
2449 if (VT.isScalableVector())
2450 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2452}
2453
2454// isIntImmediate - This method tests to see if the node is a constant
2455 // operand. If so, Imm will receive the value.
2456static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2458 Imm = C->getZExtValue();
2459 return true;
2460 }
2461 return false;
2462}
2463
2464bool isVectorizedBinOp(unsigned Opcode) {
2465 switch (Opcode) {
2466 case AArch64ISD::SQDMULH:
2467 return true;
2468 default:
2469 return false;
2470 }
2471}
2472
2473// isOpcWithIntImmediate - This method tests to see if the node is a specific
2474 // opcode and that it has an immediate integer right operand.
2475 // If so, Imm will receive the value.
2476static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2477 uint64_t &Imm) {
2478 return N->getOpcode() == Opc &&
2479 isIntImmediate(N->getOperand(1).getNode(), Imm);
2480}
2481
2482static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2483 const APInt &Demanded,
2485 unsigned NewOpc) {
2486 uint64_t OldImm = Imm, NewImm, Enc;
2487 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2488
2489 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2490 // bimm64.
2491 if (Imm == 0 || Imm == Mask ||
2493 return false;
2494
2495 unsigned EltSize = Size;
2496 uint64_t DemandedBits = Demanded.getZExtValue();
2497
2498 // Clear bits that are not demanded.
2499 Imm &= DemandedBits;
2500
2501 while (true) {
2502 // The goal here is to set the non-demanded bits in a way that minimizes
2503 // the number of switching between 0 and 1. In order to achieve this goal,
2504 // we set the non-demanded bits to the value of the preceding demanded bits.
2505 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2506 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2507 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2508 // The final result is 0b11000011.
2509 uint64_t NonDemandedBits = ~DemandedBits;
2510 uint64_t InvertedImm = ~Imm & DemandedBits;
2511 uint64_t RotatedImm =
2512 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2513 NonDemandedBits;
2514 uint64_t Sum = RotatedImm + NonDemandedBits;
2515 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2516 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2517 NewImm = (Imm | Ones) & Mask;
2518
2519 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2520 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2521 // we halve the element size and continue the search.
2522 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2523 break;
2524
2525 // We cannot shrink the element size any further if it is 2 bits.
2526 if (EltSize == 2)
2527 return false;
2528
2529 EltSize /= 2;
2530 Mask >>= EltSize;
2531 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2532
2533 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2534 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2535 return false;
2536
2537 // Merge the upper and lower halves of Imm and DemandedBits.
2538 Imm |= Hi;
2539 DemandedBits |= DemandedBitsHi;
2540 }
2541
2542 ++NumOptimizedImms;
2543
2544 // Replicate the element across the register width.
2545 while (EltSize < Size) {
2546 NewImm |= NewImm << EltSize;
2547 EltSize *= 2;
2548 }
2549
2550 (void)OldImm;
2551 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2552 "demanded bits should never be altered");
2553 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2554
2555 // Create the new constant immediate node.
2556 EVT VT = Op.getValueType();
2557 SDLoc DL(Op);
2558 SDValue New;
2559
2560 // If the new constant immediate is all-zeros or all-ones, let the target
2561 // independent DAG combine optimize this node.
2562 if (NewImm == 0 || NewImm == OrigMask) {
2563 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2564 TLO.DAG.getConstant(NewImm, DL, VT));
2565 // Otherwise, create a machine node so that target independent DAG combine
2566 // doesn't undo this optimization.
2567 } else {
2569 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2570 New = SDValue(
2571 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2572 }
2573
2574 return TLO.CombineTo(Op, New);
2575}
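// Illustrative sketch (not used by the lowering; helper name is ours): a
// minimal, self-contained version of the non-demanded-bit filling performed by
// the loop above, assuming a single element of EltSize bits and that
// DemandedBits only has bits below EltSize set. For the worked example in the
// comment (Imm = 0bx10xx0x1, demanded bits {0,2,5,6}) it returns 0b11000011.
static uint64_t fillNonDemandedBitsSketch(uint64_t Imm, uint64_t DemandedBits,
                                          unsigned EltSize) {
  uint64_t Mask = ~0ULL >> (64 - EltSize);
  Imm &= DemandedBits;
  uint64_t NonDemandedBits = ~DemandedBits;
  // Demanded positions holding a 0, rotated left by one so each lands on the
  // non-demanded bit that follows it.
  uint64_t InvertedImm = ~Imm & DemandedBits;
  uint64_t RotatedImm =
      ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
      NonDemandedBits;
  // Propagate each demanded bit's value across the following run of
  // non-demanded bits.
  uint64_t Sum = RotatedImm + NonDemandedBits;
  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
  return (Imm | Ones) & Mask;
}
// e.g. fillNonDemandedBitsSketch(0b01000001, 0b01100101, 8) == 0b11000011.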
2576
2578 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2579 TargetLoweringOpt &TLO) const {
2580 // Delay this optimization until as late as possible.
2581 if (!TLO.LegalOps)
2582 return false;
2583
2585 return false;
2586
2587 EVT VT = Op.getValueType();
2588 if (VT.isVector())
2589 return false;
2590
2591 unsigned Size = VT.getSizeInBits();
2592
2593 if (Size != 32 && Size != 64)
2594 return false;
2595
2596 // Exit early if we demand all bits.
2597 if (DemandedBits.isAllOnes())
2598 return false;
2599
2600 unsigned NewOpc;
2601 switch (Op.getOpcode()) {
2602 default:
2603 return false;
2604 case ISD::AND:
2605 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2606 break;
2607 case ISD::OR:
2608 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2609 break;
2610 case ISD::XOR:
2611 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2612 break;
2613 }
2614 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2615 if (!C)
2616 return false;
2617 uint64_t Imm = C->getZExtValue();
2618 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2619}
2620
2621/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2622 /// Mask are known to be either zero or one and return them in Known.
2624 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2625 const SelectionDAG &DAG, unsigned Depth) const {
2626 switch (Op.getOpcode()) {
2627 default:
2628 break;
2629 case AArch64ISD::DUP: {
2630 SDValue SrcOp = Op.getOperand(0);
2631 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2632 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2633 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2634 "Expected DUP implicit truncation");
2635 Known = Known.trunc(Op.getScalarValueSizeInBits());
2636 }
2637 break;
2638 }
2639 case AArch64ISD::CSEL: {
2640 KnownBits Known2;
2641 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2642 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2643 Known = Known.intersectWith(Known2);
2644 break;
2645 }
2646 case AArch64ISD::CSNEG:
2647 case AArch64ISD::CSINC:
2648 case AArch64ISD::CSINV: {
2649 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2650 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2651
2652 // The result is either:
2653 // CSINC: KnownOp0 or KnownOp1 + 1
2654 // CSINV: KnownOp0 or ~KnownOp1
2655 // CSNEG: KnownOp0 or KnownOp1 * -1
2656 if (Op.getOpcode() == AArch64ISD::CSINC)
2657 KnownOp1 = KnownBits::add(
2658 KnownOp1,
2659 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2660 else if (Op.getOpcode() == AArch64ISD::CSINV)
2661 std::swap(KnownOp1.Zero, KnownOp1.One);
2662 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2663 KnownOp1 =
2665 Op.getScalarValueSizeInBits())));
2666
2667 Known = KnownOp0.intersectWith(KnownOp1);
2668 break;
2669 }
2670 case AArch64ISD::BICi: {
2671 // Compute the bit cleared value.
2672 APInt Mask =
2673 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2674 .trunc(Known.getBitWidth());
2675 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2676 Known &= KnownBits::makeConstant(Mask);
2677 break;
2678 }
2679 case AArch64ISD::VLSHR: {
2680 KnownBits Known2;
2681 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2682 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2683 Known = KnownBits::lshr(Known, Known2);
2684 break;
2685 }
2686 case AArch64ISD::VASHR: {
2687 KnownBits Known2;
2688 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2689 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2690 Known = KnownBits::ashr(Known, Known2);
2691 break;
2692 }
2693 case AArch64ISD::VSHL: {
2694 KnownBits Known2;
2695 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2696 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2697 Known = KnownBits::shl(Known, Known2);
2698 break;
2699 }
2700 case AArch64ISD::MOVI: {
2702 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2703 break;
2704 }
2705 case AArch64ISD::MOVIshift: {
2707 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2708 << Op->getConstantOperandVal(1)));
2709 break;
2710 }
2711 case AArch64ISD::MOVImsl: {
2712 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2714 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2715 break;
2716 }
2717 case AArch64ISD::MOVIedit: {
2719 Known.getBitWidth(),
2720 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2721 break;
2722 }
2723 case AArch64ISD::MVNIshift: {
2725 APInt(Known.getBitWidth(),
2726 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2727 /*isSigned*/ false, /*implicitTrunc*/ true));
2728 break;
2729 }
2730 case AArch64ISD::MVNImsl: {
2731 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2733 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2734 /*isSigned*/ false, /*implicitTrunc*/ true));
2735 break;
2736 }
2737 case AArch64ISD::LOADgot:
2738 case AArch64ISD::ADDlow: {
2739 if (!Subtarget->isTargetILP32())
2740 break;
2741 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2742 Known.Zero = APInt::getHighBitsSet(64, 32);
2743 break;
2744 }
2745 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2746 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2747 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2748 break;
2749 }
2751 Intrinsic::ID IntID =
2752 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2753 switch (IntID) {
2754 default: return;
2755 case Intrinsic::aarch64_ldaxr:
2756 case Intrinsic::aarch64_ldxr: {
2757 unsigned BitWidth = Known.getBitWidth();
2758 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2759 unsigned MemBits = VT.getScalarSizeInBits();
2760 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2761 return;
2762 }
2763 }
2764 break;
2765 }
2767 case ISD::INTRINSIC_VOID: {
2768 unsigned IntNo = Op.getConstantOperandVal(0);
2769 switch (IntNo) {
2770 default:
2771 break;
2772 case Intrinsic::aarch64_neon_uaddlv: {
2773 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2774 unsigned BitWidth = Known.getBitWidth();
2775 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2776 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2777 assert(BitWidth >= Bound && "Unexpected width!");
2779 Known.Zero |= Mask;
2780 }
2781 break;
2782 }
2783 case Intrinsic::aarch64_neon_umaxv:
2784 case Intrinsic::aarch64_neon_uminv: {
2785 // Figure out the datatype of the vector operand. The UMINV instruction
2786 // will zero-extend the result, so we can mark as known zero all the
2787 // bits larger than the element datatype. 32-bit or larger doesn't need
2788 // this, as those are legal types and will be handled by isel directly.
2789 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2790 unsigned BitWidth = Known.getBitWidth();
2791 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2792 assert(BitWidth >= 8 && "Unexpected width!");
2794 Known.Zero |= Mask;
2795 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2796 assert(BitWidth >= 16 && "Unexpected width!");
2798 Known.Zero |= Mask;
2799 }
2800 break;
2801 } break;
2802 }
2803 }
2804 }
2805}
2806
2808 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2809 unsigned Depth) const {
2810 EVT VT = Op.getValueType();
2811 unsigned VTBits = VT.getScalarSizeInBits();
2812 unsigned Opcode = Op.getOpcode();
2813 switch (Opcode) {
2814 case AArch64ISD::FCMEQ:
2815 case AArch64ISD::FCMGE:
2816 case AArch64ISD::FCMGT:
2817 // Compares return either 0 or all-ones
2818 return VTBits;
2819 case AArch64ISD::VASHR: {
2820 unsigned Tmp =
2821 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2822 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2823 }
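// For example, in the VASHR case above: if the shifted operand already has 4
// known sign bits, a right shift by 8 yields min(4 + 8, VTBits) sign bits.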
2824 }
2825
2826 return 1;
2827}
2828
2830 EVT) const {
2831 return MVT::i64;
2832}
2833
2835 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2836 unsigned *Fast) const {
2837
2838 // Allow SVE loads/stores where the alignment >= the size of the element type,
2839 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2840 // for stores that come from IR, only require element-size alignment (even if
2841 // unaligned accesses are disabled). Without this, these will be forced to
2842 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2843 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2844 if (VT.isScalableVector()) {
2845 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2846 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2847 return true;
2848 }
2849
2850 if (Subtarget->requiresStrictAlign())
2851 return false;
2852
2853 if (Fast) {
2854 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2855 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2856 // See comments in performSTORECombine() for more details about
2857 // these conditions.
2858
2859 // Code that uses clang vector extensions can mark that it
2860 // wants unaligned accesses to be treated as fast by
2861 // underspecifying alignment to be 1 or 2.
2862 Alignment <= 2 ||
2863
2864 // Disregard v2i64. Memcpy lowering produces those and splitting
2865 // them regresses performance on micro-benchmarks and olden/bh.
2866 VT == MVT::v2i64;
2867 }
2868 return true;
2869}
2870
2871// Same as above but handling LLTs instead.
2873 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2874 unsigned *Fast) const {
2875 if (Subtarget->requiresStrictAlign())
2876 return false;
2877
2878 if (Fast) {
2879 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2880 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2881 Ty.getSizeInBytes() != 16 ||
2882 // See comments in performSTORECombine() for more details about
2883 // these conditions.
2884
2885 // Code that uses clang vector extensions can mark that it
2886 // wants unaligned accesses to be treated as fast by
2887 // underspecifying alignment to be 1 or 2.
2888 Alignment <= 2 ||
2889
2890 // Disregard v2i64. Memcpy lowering produces those and splitting
2891 // them regresses performance on micro-benchmarks and olden/bh.
2892 Ty == LLT::fixed_vector(2, 64);
2893 }
2894 return true;
2895}
2896
2897FastISel *
2899 const TargetLibraryInfo *libInfo) const {
2900 return AArch64::createFastISel(funcInfo, libInfo);
2901}
2902
2905 MachineBasicBlock *MBB) const {
2906 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2907 // phi node:
2908
2909 // OrigBB:
2910 // [... previous instrs leading to comparison ...]
2911 // b.ne TrueBB
2912 // b EndBB
2913 // TrueBB:
2914 // ; Fallthrough
2915 // EndBB:
2916 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2917
2918 MachineFunction *MF = MBB->getParent();
2919 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2920 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2921 DebugLoc DL = MI.getDebugLoc();
2922 MachineFunction::iterator It = ++MBB->getIterator();
2923
2924 Register DestReg = MI.getOperand(0).getReg();
2925 Register IfTrueReg = MI.getOperand(1).getReg();
2926 Register IfFalseReg = MI.getOperand(2).getReg();
2927 unsigned CondCode = MI.getOperand(3).getImm();
2928 bool NZCVKilled = MI.getOperand(4).isKill();
2929
2930 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2931 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2932 MF->insert(It, TrueBB);
2933 MF->insert(It, EndBB);
2934
2935 // Transfer rest of current basic-block to EndBB
2936 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2937 MBB->end());
2939
2940 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2941 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2942 MBB->addSuccessor(TrueBB);
2943 MBB->addSuccessor(EndBB);
2944
2945 // TrueBB falls through to the end.
2946 TrueBB->addSuccessor(EndBB);
2947
2948 if (!NZCVKilled) {
2949 TrueBB->addLiveIn(AArch64::NZCV);
2950 EndBB->addLiveIn(AArch64::NZCV);
2951 }
2952
2953 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2954 .addReg(IfTrueReg)
2955 .addMBB(TrueBB)
2956 .addReg(IfFalseReg)
2957 .addMBB(MBB);
2958
2959 MI.eraseFromParent();
2960 return EndBB;
2961}
2962
2970
2973 MachineBasicBlock *MBB) const {
2974 MachineFunction &MF = *MBB->getParent();
2975 MachineBasicBlock::iterator MBBI = MI.getIterator();
2976 const AArch64InstrInfo &TII =
2977 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2978 Register TargetReg = MI.getOperand(0).getReg();
2980 TII.probedStackAlloc(MBBI, TargetReg, false);
2981
2982 MI.eraseFromParent();
2983 return NextInst->getParent();
2984}
2985
2988 MachineBasicBlock *MBB) const {
2989 MachineFunction *MF = MBB->getParent();
2991
2992 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
2993 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
2994
2995 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
2996 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
2997 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
2998 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
2999
3000 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3001 DebugLoc DL = MI.getDebugLoc();
3002
3003 // RDVL requires GPR64, ADDSVL requires GPR64sp.
3004 // We need to insert COPY instructions; these will later be removed by the
3005 // RegisterCoalescer.
3006 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
3007 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
3008 .addReg(RegVL_GPR);
3009
3010 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
3011 .addReg(RegVL_GPRsp)
3012 .addImm(-1);
3013 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
3014 .addReg(RegSVL_GPRsp);
3015
3016 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3017 MachineFunction::iterator It = ++MBB->getIterator();
3018 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
3019 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
3020 MF->insert(It, TrapBB);
3021 MF->insert(It, PassBB);
3022
3023 // Continue if vector lengths match
3024 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
3025 .addReg(RegSVL_GPR)
3026 .addMBB(PassBB);
3027
3028 // Transfer rest of current BB to PassBB
3029 PassBB->splice(PassBB->begin(), MBB,
3030 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
3032
3033 // Trap if vector lengths mismatch
3034 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
3035
3036 MBB->addSuccessor(TrapBB);
3037 MBB->addSuccessor(PassBB);
3038
3039 MI.eraseFromParent();
3040 return PassBB;
3041}
3042
3044AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3046 MachineBasicBlock *BB) const {
3047 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3048 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3049
3050 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3051 MIB.add(MI.getOperand(1)); // slice index register
3052 MIB.add(MI.getOperand(2)); // slice index offset
3053 MIB.add(MI.getOperand(3)); // pg
3054 MIB.add(MI.getOperand(4)); // base
3055 MIB.add(MI.getOperand(5)); // offset
3056
3057 MI.eraseFromParent(); // The pseudo is gone now.
3058 return BB;
3059}
3060
3063 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3065 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3066
3067 MIB.addReg(AArch64::ZA, RegState::Define);
3068 MIB.add(MI.getOperand(0)); // Vector select register
3069 MIB.add(MI.getOperand(1)); // Vector select offset
3070 MIB.add(MI.getOperand(2)); // Base
3071 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3072
3073 MI.eraseFromParent(); // The pseudo is gone now.
3074 return BB;
3075}
3076
3079 unsigned Opcode,
3080 bool Op0IsDef) const {
3081 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3083
3084 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3085 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3086 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3087 MIB.add(MI.getOperand(I));
3088
3089 MI.eraseFromParent(); // The pseudo is gone now.
3090 return BB;
3091}
3092
3094AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3096 MachineBasicBlock *BB) const {
3097 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3098 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3099 unsigned StartIdx = 0;
3100
3101 bool HasTile = BaseReg != AArch64::ZA;
3102 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3103 if (HasZPROut) {
3104 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3105 ++StartIdx;
3106 }
3107 if (HasTile) {
3108 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3109 RegState::Define); // Output ZA Tile
3110 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3111 StartIdx++;
3112 } else {
3113 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm,
3114 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3115 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3116 ++StartIdx;
3117 }
3118 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3119 }
3120 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3121 MIB.add(MI.getOperand(I));
3122
3123 MI.eraseFromParent(); // The pseudo is gone now.
3124 return BB;
3125}
3126
3129 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3131 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3132 MIB.add(MI.getOperand(0)); // Mask
3133
3134 unsigned Mask = MI.getOperand(0).getImm();
3135 for (unsigned I = 0; I < 8; I++) {
3136 if (Mask & (1 << I))
3137 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3138 }
3139
3140 MI.eraseFromParent(); // The pseudo is gone now.
3141 return BB;
3142}
3143
3146 MachineBasicBlock *BB) const {
3147 MachineFunction *MF = BB->getParent();
3148 MachineFrameInfo &MFI = MF->getFrameInfo();
3150 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3151 if (TPIDR2.Uses > 0) {
3152 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3153 // generally don't support big-endian SVE/SME.
3154 if (!Subtarget->isLittleEndian())
3156 "TPIDR2 block initialization is not supported on big-endian targets");
3157
3158 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3159 // Store buffer pointer and num_za_save_slices.
3160 // Bytes 10-15 are implicitly zeroed.
3161 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3162 .addReg(MI.getOperand(0).getReg())
3163 .addReg(MI.getOperand(1).getReg())
3164 .addFrameIndex(TPIDR2.FrameIndex)
3165 .addImm(0);
3166 } else
3167 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3168
3169 BB->remove_instr(&MI);
3170 return BB;
3171}
3172
3175 MachineBasicBlock *BB) const {
3176 MachineFunction *MF = BB->getParent();
3177 MachineFrameInfo &MFI = MF->getFrameInfo();
3179 // TODO: This function grows the stack with a subtraction, which doesn't work
3180 // on Windows. Some refactoring to share the functionality in
3181 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3182 // supports SME.
3184 "Lazy ZA save is not yet supported on Windows");
3185
3186 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3187
3188 if (TPIDR2.Uses > 0) {
3189 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3191
3192 // The SUBXrs below won't always be emitted in a form that accepts SP
3193 // directly
3194 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3195 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3196 .addReg(AArch64::SP);
3197
3198 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3199 auto Size = MI.getOperand(1).getReg();
3200 auto Dest = MI.getOperand(0).getReg();
3201 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3202 .addReg(Size)
3203 .addReg(Size)
3204 .addReg(SP);
3205 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3206 AArch64::SP)
3207 .addReg(Dest);
3208
3209 // We have just allocated a variable sized object, tell this to PEI.
3210 MFI.CreateVariableSizedObject(Align(16), nullptr);
3211 }
3212
3213 BB->remove_instr(&MI);
3214 return BB;
3215}
3216
3217// TODO: Find a way to merge this with EmitAllocateZABuffer.
3220 MachineBasicBlock *BB) const {
3221 MachineFunction *MF = BB->getParent();
3222 MachineFrameInfo &MFI = MF->getFrameInfo();
3225 "Lazy ZA save is not yet supported on Windows");
3226
3227 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3228 if (FuncInfo->isSMESaveBufferUsed()) {
3229 // Allocate a buffer object of the size given by MI.getOperand(1).
3230 auto Size = MI.getOperand(1).getReg();
3231 auto Dest = MI.getOperand(0).getReg();
3232 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3233 .addReg(AArch64::SP)
3234 .addReg(Size)
3236 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3237 .addReg(AArch64::SP);
3238
3239 // We have just allocated a variable sized object, tell this to PEI.
3240 MFI.CreateVariableSizedObject(Align(16), nullptr);
3241 } else
3242 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3243 MI.getOperand(0).getReg());
3244
3245 BB->remove_instr(&MI);
3246 return BB;
3247}
3248
3251 MachineBasicBlock *BB) const {
3252 // If the buffer is used, emit a call to __arm_sme_state_size()
3253 MachineFunction *MF = BB->getParent();
3255 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3256 if (FuncInfo->isSMESaveBufferUsed()) {
3257 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3258 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3259 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3261 .addReg(AArch64::X0, RegState::ImplicitDefine)
3262 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3263 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3264 MI.getOperand(0).getReg())
3265 .addReg(AArch64::X0);
3266 } else
3267 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3268 MI.getOperand(0).getReg())
3269 .addReg(AArch64::XZR);
3270 BB->remove_instr(&MI);
3271 return BB;
3272}
3273
3276 MachineBasicBlock *BB) const {
3277 MachineFunction *MF = BB->getParent();
3278 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3279 const DebugLoc &DL = MI.getDebugLoc();
3280 Register ResultReg = MI.getOperand(0).getReg();
3281 if (MF->getRegInfo().use_empty(ResultReg)) {
3282 // Nothing to do. Pseudo erased below.
3283 } else if (Subtarget->hasSME()) {
3284 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3285 .addImm(AArch64SysReg::SVCR)
3286 .addReg(AArch64::VG, RegState::Implicit);
3287 } else {
3288 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3289 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3290 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
3292 .addReg(AArch64::X0, RegState::ImplicitDefine)
3293 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3294 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3295 .addReg(AArch64::X0);
3296 }
3297 MI.eraseFromParent();
3298 return BB;
3299}
3300
3301// Helper function to find the instruction that defined a virtual register.
3302 // If unable to find such an instruction, returns nullptr.
3304 Register Reg) {
3305 while (Reg.isVirtual()) {
3306 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3307 assert(DefMI && "Virtual register definition not found");
3308 unsigned Opcode = DefMI->getOpcode();
3309
3310 if (Opcode == AArch64::COPY) {
3311 Reg = DefMI->getOperand(1).getReg();
3312 // Vreg is defined by copying from physreg.
3313 if (Reg.isPhysical())
3314 return DefMI;
3315 continue;
3316 }
3317 if (Opcode == AArch64::SUBREG_TO_REG) {
3318 Reg = DefMI->getOperand(2).getReg();
3319 continue;
3320 }
3321
3322 return DefMI;
3323 }
3324 return nullptr;
3325}
3326
3329 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3330 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3331 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3332 const DebugLoc &DL = MI.getDebugLoc();
3333
3334 Register AddrDisc = AddrDiscOp.getReg();
3335 int64_t IntDisc = IntDiscOp.getImm();
3336 assert(IntDisc == 0 && "Blend components are already expanded");
3337
3338 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3339 if (DiscMI) {
3340 switch (DiscMI->getOpcode()) {
3341 case AArch64::MOVKXi:
3342 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3343 // #imm should be an immediate and not a global symbol, for example.
3344 if (DiscMI->getOperand(2).isImm() &&
3345 DiscMI->getOperand(3).getImm() == 48) {
3346 AddrDisc = DiscMI->getOperand(1).getReg();
3347 IntDisc = DiscMI->getOperand(2).getImm();
3348 }
3349 break;
3350 case AArch64::MOVi32imm:
3351 case AArch64::MOVi64imm:
3352 // Small immediate integer constant passed via VReg.
3353 if (DiscMI->getOperand(1).isImm() &&
3354 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3355 AddrDisc = AArch64::NoRegister;
3356 IntDisc = DiscMI->getOperand(1).getImm();
3357 }
3358 break;
3359 }
3360 }
3361
3362 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3363 // in the requested register class.
3364 if (AddrDisc == AArch64::XZR)
3365 AddrDisc = AArch64::NoRegister;
3366
3367 // Make sure AddrDisc operand respects the register class imposed by MI.
3368 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3369 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3370 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3371 AddrDisc = TmpReg;
3372 }
3373
3374 AddrDiscOp.setReg(AddrDisc);
3375 IntDiscOp.setImm(IntDisc);
3376}
3377
3379 MachineInstr &MI, MachineBasicBlock *BB) const {
3380
3381 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3382 if (SMEOrigInstr != -1) {
3383 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3384 uint64_t SMEMatrixType =
3385 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3386 switch (SMEMatrixType) {
3388 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3390 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3392 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3394 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3396 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3398 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3399 }
3400 }
3401
3402 switch (MI.getOpcode()) {
3403 default:
3404#ifndef NDEBUG
3405 MI.dump();
3406#endif
3407 llvm_unreachable("Unexpected instruction for custom inserter!");
3408 case AArch64::InitTPIDR2Obj:
3409 return EmitInitTPIDR2Object(MI, BB);
3410 case AArch64::AllocateZABuffer:
3411 return EmitAllocateZABuffer(MI, BB);
3412 case AArch64::AllocateSMESaveBuffer:
3413 return EmitAllocateSMESaveBuffer(MI, BB);
3414 case AArch64::GetSMESaveSize:
3415 return EmitGetSMESaveSize(MI, BB);
3416 case AArch64::EntryPStateSM:
3417 return EmitEntryPStateSM(MI, BB);
3418 case AArch64::F128CSEL:
3419 return EmitF128CSEL(MI, BB);
3420 case TargetOpcode::STATEPOINT:
3421 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3422 // while bl call instruction (where statepoint will be lowered at the end)
3423 // has implicit def. This def is early-clobber as it will be set at
3424 // the moment of the call and earlier than any use is read.
3425 // Add this implicit dead def here as a workaround.
3426 MI.addOperand(*MI.getMF(),
3428 AArch64::LR, /*isDef*/ true,
3429 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3430 /*isUndef*/ false, /*isEarlyClobber*/ true));
3431 [[fallthrough]];
3432 case TargetOpcode::STACKMAP:
3433 case TargetOpcode::PATCHPOINT:
3434 return emitPatchPoint(MI, BB);
3435
3436 case TargetOpcode::PATCHABLE_EVENT_CALL:
3437 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3438 return BB;
3439
3440 case AArch64::CATCHRET:
3441 return EmitLoweredCatchRet(MI, BB);
3442
3443 case AArch64::PROBED_STACKALLOC_DYN:
3444 return EmitDynamicProbedAlloc(MI, BB);
3445
3446 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3447 return EmitCheckMatchingVL(MI, BB);
3448
3449 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3450 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3451 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3452 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3453 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3454 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3455 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3456 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3457 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3458 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3459 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3460 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3461 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3462 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3463 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3464 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3465 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3466 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3467 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3468 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3469 case AArch64::LDR_ZA_PSEUDO:
3470 return EmitFill(MI, BB);
3471 case AArch64::LDR_TX_PSEUDO:
3472 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3473 case AArch64::STR_TX_PSEUDO:
3474 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3475 case AArch64::ZERO_M_PSEUDO:
3476 return EmitZero(MI, BB);
3477 case AArch64::ZERO_T_PSEUDO:
3478 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3479 case AArch64::MOVT_TIZ_PSEUDO:
3480 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3481
3482 case AArch64::PAC:
3483 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3484 &AArch64::GPR64noipRegClass);
3485 return BB;
3486 }
3487}
3488
3489//===----------------------------------------------------------------------===//
3490// AArch64 Lowering private implementation.
3491//===----------------------------------------------------------------------===//
3492
3493//===----------------------------------------------------------------------===//
3494// Lowering Code
3495//===----------------------------------------------------------------------===//
3496
3497// Forward declarations of SVE fixed length lowering helpers
3502 SelectionDAG &DAG);
3505 EVT VT);
3506
3507/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3508static bool isZerosVector(const SDNode *N) {
3509 // Look through a bit convert.
3510 while (N->getOpcode() == ISD::BITCAST)
3511 N = N->getOperand(0).getNode();
3512
3514 return true;
3515
3516 if (N->getOpcode() != AArch64ISD::DUP)
3517 return false;
3518
3519 auto Opnd0 = N->getOperand(0);
3520 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3521}
3522
3523/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3524/// CC
3526 SDValue RHS = {}) {
3527 switch (CC) {
3528 default:
3529 llvm_unreachable("Unknown condition code!");
3530 case ISD::SETNE:
3531 return AArch64CC::NE;
3532 case ISD::SETEQ:
3533 return AArch64CC::EQ;
3534 case ISD::SETGT:
3535 return AArch64CC::GT;
3536 case ISD::SETGE:
3538 case ISD::SETLT:
3540 case ISD::SETLE:
3541 return AArch64CC::LE;
3542 case ISD::SETUGT:
3543 return AArch64CC::HI;
3544 case ISD::SETUGE:
3545 return AArch64CC::HS;
3546 case ISD::SETULT:
3547 return AArch64CC::LO;
3548 case ISD::SETULE:
3549 return AArch64CC::LS;
3550 }
3551}
3552
3553/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3555 AArch64CC::CondCode &CondCode,
3556 AArch64CC::CondCode &CondCode2) {
3557 CondCode2 = AArch64CC::AL;
3558 switch (CC) {
3559 default:
3560 llvm_unreachable("Unknown FP condition!");
3561 case ISD::SETEQ:
3562 case ISD::SETOEQ:
3563 CondCode = AArch64CC::EQ;
3564 break;
3565 case ISD::SETGT:
3566 case ISD::SETOGT:
3567 CondCode = AArch64CC::GT;
3568 break;
3569 case ISD::SETGE:
3570 case ISD::SETOGE:
3571 CondCode = AArch64CC::GE;
3572 break;
3573 case ISD::SETOLT:
3574 CondCode = AArch64CC::MI;
3575 break;
3576 case ISD::SETOLE:
3577 CondCode = AArch64CC::LS;
3578 break;
3579 case ISD::SETONE:
3580 CondCode = AArch64CC::MI;
3581 CondCode2 = AArch64CC::GT;
3582 break;
3583 case ISD::SETO:
3584 CondCode = AArch64CC::VC;
3585 break;
3586 case ISD::SETUO:
3587 CondCode = AArch64CC::VS;
3588 break;
3589 case ISD::SETUEQ:
3590 CondCode = AArch64CC::EQ;
3591 CondCode2 = AArch64CC::VS;
3592 break;
3593 case ISD::SETUGT:
3594 CondCode = AArch64CC::HI;
3595 break;
3596 case ISD::SETUGE:
3597 CondCode = AArch64CC::PL;
3598 break;
3599 case ISD::SETLT:
3600 case ISD::SETULT:
3601 CondCode = AArch64CC::LT;
3602 break;
3603 case ISD::SETLE:
3604 case ISD::SETULE:
3605 CondCode = AArch64CC::LE;
3606 break;
3607 case ISD::SETNE:
3608 case ISD::SETUNE:
3609 CondCode = AArch64CC::NE;
3610 break;
3611 }
3612}
3613
3614/// Convert a DAG fp condition code to an AArch64 CC.
3615/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3616/// should be AND'ed instead of OR'ed.
3618 AArch64CC::CondCode &CondCode,
3619 AArch64CC::CondCode &CondCode2) {
3620 CondCode2 = AArch64CC::AL;
3621 switch (CC) {
3622 default:
3623 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3624 assert(CondCode2 == AArch64CC::AL);
3625 break;
3626 case ISD::SETONE:
3627 // (a one b)
3628 // == ((a olt b) || (a ogt b))
3629 // == ((a ord b) && (a une b))
3630 CondCode = AArch64CC::VC;
3631 CondCode2 = AArch64CC::NE;
3632 break;
3633 case ISD::SETUEQ:
3634 // (a ueq b)
3635 // == ((a uno b) || (a oeq b))
3636 // == ((a ule b) && (a uge b))
3637 CondCode = AArch64CC::PL;
3638 CondCode2 = AArch64CC::LE;
3639 break;
3640 }
3641}
3642
3643/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3644/// CC usable with the vector instructions. Fewer operations are available
3645/// without a real NZCV register, so we have to use less efficient combinations
3646/// to get the same effect.
3648 AArch64CC::CondCode &CondCode,
3649 AArch64CC::CondCode &CondCode2,
3650 bool &Invert) {
3651 Invert = false;
3652 switch (CC) {
3653 default:
3654 // Mostly the scalar mappings work fine.
3655 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3656 break;
3657 case ISD::SETUO:
3658 Invert = true;
3659 [[fallthrough]];
3660 case ISD::SETO:
3661 CondCode = AArch64CC::MI;
3662 CondCode2 = AArch64CC::GE;
3663 break;
3664 case ISD::SETUEQ:
3665 case ISD::SETULT:
3666 case ISD::SETULE:
3667 case ISD::SETUGT:
3668 case ISD::SETUGE:
3669 // All of the compare-mask comparisons are ordered, but we can switch
3670 // between the two by a double inversion. E.g. ULE == !OGT.
3671 Invert = true;
3672 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3673 CondCode, CondCode2);
3674 break;
3675 }
3676}
3677
3678/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
3680 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3681 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3682}
3683
3685 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3686 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3687 LLVM_DEBUG(dbgs() << "Is imm " << C
3688 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3689 return IsLegal;
3690}
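// For example, per the check above: 0xFFF (fits in 12 bits) and 0x123000
// (a 12-bit value shifted left by 12) are legal add/sub immediates, while
// 0x1001 is not.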
3691
3693 // Works for negative immediates too, as it can be written as an ADDS
3694 // instruction with a negated immediate.
3695 return isLegalArithImmed(C.abs().getZExtValue());
3696}
3697
3699 uint64_t Imm = C.getZExtValue();
3701 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3702 return Insn.size();
3703}
3704
3706 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3707 if (Op->getFlags().hasNoSignedWrap())
3708 return true;
3709
3710 // We can still figure out if the second operand is safe to use
3711 // in a CMN instruction by checking whether it is known not to be the
3712 // minimum signed value. If it is not INT_MIN, then we can safely use CMN.
3713 // Note: We can eventually remove this check and simply rely on
3714 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3715 // consistently sets them appropriately when making said nodes.
3716
3717 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3718 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3719}
3720
3721// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3722// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3723// can be set differently by this operation. It comes down to whether
3724// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3725// everything is fine. If not then the optimization is wrong. Thus general
3726// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3727//
3728// So, finally, the only LLVM-native comparisons that don't mention C or V
3729// are the ones that aren't unsigned comparisons. They're the only ones we can
3730// safely use CMN for in the absence of information about op2.
3731static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
3732 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3733 (isIntEqualitySetCC(CC) ||
3734 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3735 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3736}
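// Annotation (illustrative, not part of the upstream source): the transform
// guarded by this predicate turns
//   neg w8, w1
//   cmp w0, w8
// into
//   cmn w0, w1          (flags of w0 + w1)
// Equality tests only read Z, so they are always safe. Signed orderings are
// safe when w1 is known not to be INT_MIN (0 - INT_MIN wraps), and unsigned
// orderings when w1 is known to be non-zero, matching the checks above.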
3737
3738static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL,
3739 SelectionDAG &DAG, SDValue Chain,
3740 bool IsSignaling) {
3741 EVT VT = LHS.getValueType();
3742 assert(VT != MVT::f128);
3743
3744 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3745
3746 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3747 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3748 {Chain, LHS});
3749 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3750 {LHS.getValue(1), RHS});
3751 Chain = RHS.getValue(1);
3752 }
3753 unsigned Opcode =
3754 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3755 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3756}
3757
3758static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3759 const SDLoc &DL, SelectionDAG &DAG) {
3760 EVT VT = LHS.getValueType();
3761 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3762
3763 if (VT.isFloatingPoint()) {
3764 assert(VT != MVT::f128);
3765 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3766 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3767 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3768 }
3769 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3770 }
3771
3772 // The CMP instruction is just an alias for SUBS, and representing it as
3773 // SUBS means that it's possible to get CSE with subtract operations.
3774 // A later phase can perform the optimization of setting the destination
3775 // register to WZR/XZR if it ends up being unused.
3776 unsigned Opcode = AArch64ISD::SUBS;
3777
3778 if (isCMN(RHS, CC, DAG)) {
3779 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3780 Opcode = AArch64ISD::ADDS;
3781 RHS = RHS.getOperand(1);
3782 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3783 isIntEqualitySetCC(CC)) {
3784 // As we are looking for EQ/NE compares, the operands can be commuted; can
3785 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3786 Opcode = AArch64ISD::ADDS;
3787 LHS = LHS.getOperand(1);
3788 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3789 if (LHS.getOpcode() == ISD::AND) {
3790 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3791 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3792 // of the signed comparisons.
3793 const SDValue ANDSNode =
3794 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3795 LHS.getOperand(0), LHS.getOperand(1));
3796 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3797 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3798 return ANDSNode.getValue(1);
3799 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3800 // Use result of ANDS
3801 return LHS.getValue(1);
3802 }
3803 }
3804
3805 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3806 .getValue(1);
3807}
3808
3809/// \defgroup AArch64CCMP CMP;CCMP matching
3810///
3811/// These functions deal with the formation of CMP;CCMP;... sequences.
3812/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3813/// a comparison. They set the NZCV flags to a predefined value if their
3814/// predicate is false. This allows us to express arbitrary conjunctions, for
3815/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3816/// expressed as:
3817/// cmp A
3818/// ccmp B, inv(CB), CA
3819/// check for CB flags
3820///
3821/// This naturally lets us implement chains of AND operations with SETCC
3822/// operands. And we can even implement some other situations by transforming
3823/// them:
3824/// - We can implement (NEG SETCC), i.e. negate a single comparison, by
3825/// negating the flags used in the CCMP/FCCMP operation.
3826/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3827/// by negating the flags we test for afterwards, i.e.
3828/// NEG (CMP CCMP CCMP ...) can be implemented.
3829/// - Note that we can only ever negate all previously processed results.
3830/// What we cannot implement by flipping the flags to test is a negation
3831/// of two sub-trees (because the negation affects all sub-trees emitted so
3832/// far, so the 2nd sub-tree we emit would also affect the first).
3833/// With those tools we can implement some OR operations:
3834/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3835/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3836/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3837/// elimination rules from earlier to implement the whole thing as a
3838/// CCMP/FCCMP chain.
3839///
3840/// As a complete example:
3841/// or (or (setCA (cmp A)) (setCB (cmp B)))
3842/// (and (setCC (cmp C)) (setCD (cmp D)))
3843/// can be reassociated to:
3844/// or (and (setCC (cmp C)) (setCD (cmp D)))
3845/// (or (setCA (cmp A)) (setCB (cmp B)))
3846/// can be transformed to:
3847/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3848/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3849/// which can be implemented as:
3850/// cmp C
3851/// ccmp D, inv(CD), CC
3852/// ccmp A, CA, inv(CD)
3853/// ccmp B, CB, inv(CA)
3854/// check for CB flags
3855///
3856/// A counterexample is "or (and A B) (and C D)" which translates to
3857/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3858/// can only implement 1 of the inner (not) operations, but not both!
3859/// @{
3860
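// Annotation (illustrative, not part of the upstream source): for a
// source-level conjunction such as
//   bool f(long a, long b) { return a == 0 && b > 5; }
// the expected CMP;CCMP;CSET chain is roughly:
//   cmp  x0, #0              (EQ iff a == 0)
//   ccmp x1, #5, #4, eq      (if EQ: compare b with 5; else NZCV := 0b0100)
//   cset w0, gt
// The literal NZCV value is chosen to satisfy the *inverted* final condition,
// so a failed predicate makes the whole chain evaluate to false.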
3861/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3862static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3863 ISD::CondCode CC, SDValue CCOp,
3864 AArch64CC::CondCode Predicate,
3865 AArch64CC::CondCode OutCC,
3866 const SDLoc &DL, SelectionDAG &DAG) {
3867 unsigned Opcode = 0;
3868 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3869
3870 if (LHS.getValueType().isFloatingPoint()) {
3871 assert(LHS.getValueType() != MVT::f128);
3872 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3873 LHS.getValueType() == MVT::bf16) {
3874 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3875 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3876 }
3877 Opcode = AArch64ISD::FCCMP;
3878 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3879 APInt Imm = Const->getAPIntValue();
3880 if (Imm.isNegative() && Imm.sgt(-32)) {
3881 Opcode = AArch64ISD::CCMN;
3882 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3883 }
3884 } else if (isCMN(RHS, CC, DAG)) {
3885 Opcode = AArch64ISD::CCMN;
3886 RHS = RHS.getOperand(1);
3887 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3888 isIntEqualitySetCC(CC)) {
3889 // As we are looking for EQ/NE compares, the operands can be commuted; can
3890 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3891 Opcode = AArch64ISD::CCMN;
3892 LHS = LHS.getOperand(1);
3893 }
3894 if (Opcode == 0)
3895 Opcode = AArch64ISD::CCMP;
3896
3897 SDValue Condition = getCondCode(DAG, Predicate);
3898 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3899 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3900 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3901 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3902}
3903
3904/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3905/// expressed as a conjunction. See \ref AArch64CCMP.
3906/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3907/// changing the conditions on the SETCC tests.
3908/// (this means we can call emitConjunctionRec() with
3909/// Negate==true on this sub-tree)
3910/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3911/// cannot do the negation naturally. We are required to
3912/// emit the subtree first in this case.
3913/// \param PreferFirst Set to true if processing this subtree first may
3914/// result in more efficient code.
3915/// \param WillNegate Is true if we are called when the result of this
3916/// subexpression must be negated. This happens when the
3917/// outer expression is an OR. We can use this fact to know
3918/// that we have a double negation (or (or ...) ...) that
3919/// can be implemented for free.
3920static bool canEmitConjunction(SelectionDAG &DAG, const SDValue Val,
3921 bool &CanNegate, bool &MustBeFirst,
3922 bool &PreferFirst, bool WillNegate,
3923 unsigned Depth = 0) {
3924 if (!Val.hasOneUse())
3925 return false;
3926 unsigned Opcode = Val->getOpcode();
3927 if (Opcode == ISD::SETCC) {
3928 EVT VT = Val->getOperand(0).getValueType();
3929 if (VT == MVT::f128)
3930 return false;
3931 CanNegate = true;
3932 MustBeFirst = false;
3933 // Designate this operation as a preferred first operation if the result
3934 // of a SUB operation can be reused.
3935 PreferFirst = DAG.doesNodeExist(ISD::SUB, DAG.getVTList(VT),
3936 {Val->getOperand(0), Val->getOperand(1)});
3937 return true;
3938 }
3939 // Protect against exponential runtime and stack overflow.
3940 if (Depth > 6)
3941 return false;
3942 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3943 bool IsOR = Opcode == ISD::OR;
3944 SDValue O0 = Val->getOperand(0);
3945 SDValue O1 = Val->getOperand(1);
3946 bool CanNegateL;
3947 bool MustBeFirstL;
3948 bool PreferFirstL;
3949 if (!canEmitConjunction(DAG, O0, CanNegateL, MustBeFirstL, PreferFirstL,
3950 IsOR, Depth + 1))
3951 return false;
3952 bool CanNegateR;
3953 bool MustBeFirstR;
3954 bool PreferFirstR;
3955 if (!canEmitConjunction(DAG, O1, CanNegateR, MustBeFirstR, PreferFirstR,
3956 IsOR, Depth + 1))
3957 return false;
3958
3959 if (MustBeFirstL && MustBeFirstR)
3960 return false;
3961
3962 if (IsOR) {
3963 // For an OR expression we need to be able to naturally negate at least
3964 // one side or we cannot do the transformation at all.
3965 if (!CanNegateL && !CanNegateR)
3966 return false;
3967 // If the result of the OR will be negated and we can naturally negate
3968 // the leaves, then this sub-tree as a whole negates naturally.
3969 CanNegate = WillNegate && CanNegateL && CanNegateR;
3970 // If we cannot naturally negate the whole sub-tree, then this must be
3971 // emitted first.
3972 MustBeFirst = !CanNegate;
3973 } else {
3974 assert(Opcode == ISD::AND && "Must be OR or AND");
3975 // We cannot naturally negate an AND operation.
3976 CanNegate = false;
3977 MustBeFirst = MustBeFirstL || MustBeFirstR;
3978 }
3979 PreferFirst = PreferFirstL || PreferFirstR;
3980 return true;
3981 }
3982 return false;
3983}
3984
3985/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3986/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3987/// Tries to transform the given i1 producing node @p Val to a series of
3988/// compare and conditional compare operations. @returns an NZCV flags
3989/// producing node and sets @p OutCC to the flags that should be tested, or
3990/// returns SDValue() if the transformation was not possible.
3991/// \p Negate is true if we want this sub-tree to be negated just by changing
3992/// SETCC conditions.
3993static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3994 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3995 AArch64CC::CondCode Predicate) {
3996 // We're at a tree leaf, produce a conditional comparison operation.
3997 unsigned Opcode = Val->getOpcode();
3998 if (Opcode == ISD::SETCC) {
3999 SDValue LHS = Val->getOperand(0);
4000 SDValue RHS = Val->getOperand(1);
4001 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
4002 bool isInteger = LHS.getValueType().isInteger();
4003 if (Negate)
4004 CC = getSetCCInverse(CC, LHS.getValueType());
4005 SDLoc DL(Val);
4006 // Determine OutCC and handle FP special case.
4007 if (isInteger) {
4008 OutCC = changeIntCCToAArch64CC(CC, RHS);
4009 } else {
4010 assert(LHS.getValueType().isFloatingPoint());
4011 AArch64CC::CondCode ExtraCC;
4012 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4013 // Some floating point conditions can't be tested with a single condition
4014 // code. Construct an additional comparison in this case.
4015 if (ExtraCC != AArch64CC::AL) {
4016 SDValue ExtraCmp;
4017 if (!CCOp.getNode())
4018 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
4019 else
4020 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
4021 ExtraCC, DL, DAG);
4022 CCOp = ExtraCmp;
4023 Predicate = ExtraCC;
4024 }
4025 }
4026
4027 // Produce a normal comparison if we are first in the chain
4028 if (!CCOp)
4029 return emitComparison(LHS, RHS, CC, DL, DAG);
4030 // Otherwise produce a ccmp.
4031 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
4032 DAG);
4033 }
4034 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
4035
4036 bool IsOR = Opcode == ISD::OR;
4037
4038 SDValue LHS = Val->getOperand(0);
4039 bool CanNegateL;
4040 bool MustBeFirstL;
4041 bool PreferFirstL;
4042 bool ValidL = canEmitConjunction(DAG, LHS, CanNegateL, MustBeFirstL,
4043 PreferFirstL, IsOR);
4044 assert(ValidL && "Valid conjunction/disjunction tree");
4045 (void)ValidL;
4046
4047 SDValue RHS = Val->getOperand(1);
4048 bool CanNegateR;
4049 bool MustBeFirstR;
4050 bool PreferFirstR;
4051 bool ValidR = canEmitConjunction(DAG, RHS, CanNegateR, MustBeFirstR,
4052 PreferFirstR, IsOR);
4053 assert(ValidR && "Valid conjunction/disjunction tree");
4054 (void)ValidR;
4055
4056 bool ShouldFirstL = PreferFirstL && !PreferFirstR && !MustBeFirstR;
4057
4058 // Swap sub-tree that must or should come first to the right side.
4059 if (MustBeFirstL || ShouldFirstL) {
4060 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4061 std::swap(LHS, RHS);
4062 std::swap(CanNegateL, CanNegateR);
4063 std::swap(MustBeFirstL, MustBeFirstR);
4064 }
4065
4066 bool NegateR;
4067 bool NegateAfterR;
4068 bool NegateL;
4069 bool NegateAfterAll;
4070 if (Opcode == ISD::OR) {
4071 // Swap the sub-tree that we can negate naturally to the left.
4072 if (!CanNegateL) {
4073 assert(CanNegateR && "at least one side must be negatable");
4074 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4075 assert(!Negate);
4076 std::swap(LHS, RHS);
4077 NegateR = false;
4078 NegateAfterR = true;
4079 } else {
4080 // Negate the left sub-tree if possible, otherwise negate the result.
4081 NegateR = CanNegateR;
4082 NegateAfterR = !CanNegateR;
4083 }
4084 NegateL = true;
4085 NegateAfterAll = !Negate;
4086 } else {
4087 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
4088 assert(!Negate && "Valid conjunction/disjunction tree");
4089
4090 NegateL = false;
4091 NegateR = false;
4092 NegateAfterR = false;
4093 NegateAfterAll = false;
4094 }
4095
4096 // Emit sub-trees.
4097 AArch64CC::CondCode RHSCC;
4098 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
4099 if (NegateAfterR)
4100 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4101 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
4102 if (NegateAfterAll)
4103 OutCC = AArch64CC::getInvertedCondCode(OutCC);
4104 return CmpL;
4105}
4106
4107/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
4108/// In some cases this is even possible with OR operations in the expression.
4109/// See \ref AArch64CCMP.
4110/// \see emitConjunctionRec().
4111static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
4112 AArch64CC::CondCode &OutCC) {
4113 bool DummyCanNegate;
4114 bool DummyMustBeFirst;
4115 bool DummyPreferFirst;
4116 if (!canEmitConjunction(DAG, Val, DummyCanNegate, DummyMustBeFirst,
4117 DummyPreferFirst, false))
4118 return SDValue();
4119
4120 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
4121}
4122
4123/// @}
4124
4125/// Returns how profitable it is to fold a comparison's operand's shift and/or
4126/// extension operations.
4127static unsigned getCmpOperandFoldingProfit(SDValue Op) {
4128 auto isSupportedExtend = [&](SDValue V) {
4129 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4130 return true;
4131
4132 if (V.getOpcode() == ISD::AND)
4133 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4134 uint64_t Mask = MaskCst->getZExtValue();
4135 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4136 }
4137
4138 return false;
4139 };
4140
4141 if (!Op.hasOneUse())
4142 return 0;
4143
4144 if (isSupportedExtend(Op))
4145 return 1;
4146
4147 unsigned Opc = Op.getOpcode();
4148 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4149 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4150 uint64_t Shift = ShiftCst->getZExtValue();
4151 if (isSupportedExtend(Op.getOperand(0)))
4152 return (Shift <= 4) ? 2 : 1;
4153 EVT VT = Op.getValueType();
4154 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4155 return 1;
4156 }
4157
4158 return 0;
4159}
4160
4161// emitComparison() converts comparison with one or negative one to comparison
4162// with 0. Note that this only works for signed comparisons because of how ANDS
4163// works.
4164static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC) {
4165 // Only works for ANDS and AND.
4166 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4167 return false;
4168
4169 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4170 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4171 return true;
4172 }
4173
4174 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4175 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4176 return true;
4177 }
4178
4179 return false;
4180}
4181
4182static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4183 SDValue &AArch64cc, SelectionDAG &DAG,
4184 const SDLoc &DL) {
4185 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4186 EVT VT = RHS.getValueType();
4187 APInt C = RHSC->getAPIntValue();
4188 // shouldBeAdjustedToZero is a special case to better fold with
4189 // emitComparison().
4190 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4191 // Adjust the constant to zero.
4192 // CC has already been adjusted.
4193 RHS = DAG.getConstant(0, DL, VT);
4194 } else if (!isLegalCmpImmed(C)) {
4195 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4196 // Constant does not fit, try adjusting it by one?
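// For example (illustrative): 0xFFF001 is not a valid 12-bit (optionally
// shifted) compare immediate, but 0xFFF001 - 1 = 0xFFF000 is, so
// (x slt 0xFFF001) can be emitted as (x sle 0xFFF000), i.e.
// "cmp x0, #0xfff, lsl #12" followed by an LE-based test.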
4197 switch (CC) {
4198 default:
4199 break;
4200 case ISD::SETLT:
4201 case ISD::SETGE:
4202 if (!C.isMinSignedValue()) {
4203 APInt CMinusOne = C - 1;
4204 if (isLegalCmpImmed(CMinusOne) ||
4205 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4206 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4207 RHS = DAG.getConstant(CMinusOne, DL, VT);
4208 }
4209 }
4210 break;
4211 case ISD::SETULT:
4212 case ISD::SETUGE: {
4213 // C is not 0 because 0 is a legal immediate, and C here is not legal.
4214 assert(!C.isZero() && "C should not be zero here");
4215 APInt CMinusOne = C - 1;
4216 if (isLegalCmpImmed(CMinusOne) ||
4217 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4218 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4219 RHS = DAG.getConstant(CMinusOne, DL, VT);
4220 }
4221 break;
4222 }
4223 case ISD::SETLE:
4224 case ISD::SETGT:
4225 if (!C.isMaxSignedValue()) {
4226 APInt CPlusOne = C + 1;
4227 if (isLegalCmpImmed(CPlusOne) ||
4228 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4229 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4230 RHS = DAG.getConstant(CPlusOne, DL, VT);
4231 }
4232 }
4233 break;
4234 case ISD::SETULE:
4235 case ISD::SETUGT: {
4236 if (!C.isAllOnes()) {
4237 APInt CPlusOne = C + 1;
4238 if (isLegalCmpImmed(CPlusOne) ||
4239 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4240 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4241 RHS = DAG.getConstant(CPlusOne, DL, VT);
4242 }
4243 }
4244 break;
4245 }
4246 }
4247 }
4248 }
4249
4250 // Comparisons are canonicalized so that the RHS operand is simpler than the
4251 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4252 // can fold some shift+extend operations on the RHS operand, so swap the
4253 // operands if that can be done.
4254 //
4255 // For example:
4256 // lsl w13, w11, #1
4257 // cmp w13, w12
4258 // can be turned into:
4259 // cmp w12, w11, lsl #1
4260 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4261 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4262 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4263 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4264 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4265
4266 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4267 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4268 std::swap(LHS, RHS);
4269 CC = ISD::getSetCCSwappedOperands(CC);
4270 }
4271 }
4272
4273 SDValue Cmp;
4274 AArch64CC::CondCode AArch64CC;
4275 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4276 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4277
4278 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4279 // For the i8 operand, the largest immediate is 255, so this can be easily
4280 // encoded in the compare instruction. For the i16 operand, however, the
4281 // largest immediate cannot be encoded in the compare.
4282 // Therefore, use a sign extending load and cmn to avoid materializing the
4283 // -1 constant. For example,
4284 // movz w1, #65535
4285 // ldrh w0, [x0, #0]
4286 // cmp w0, w1
4287 // >
4288 // ldrsh w0, [x0, #0]
4289 // cmn w0, #1
4290 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4291 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4292 // ensure both the LHS and RHS are truly zero extended and to make sure the
4293 // transformation is profitable.
4294 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4295 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4296 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4297 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4298 int16_t ValueofRHS = RHS->getAsZExtVal();
4299 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4300 SDValue SExt =
4301 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4302 DAG.getValueType(MVT::i16));
4303 Cmp = emitComparison(
4304 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4305 DL, DAG);
4306 AArch64CC = changeIntCCToAArch64CC(CC, RHS);
4307 }
4308 }
4309
4310 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4311 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4312 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4313 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4314 }
4315 }
4316 }
4317
4318 if (!Cmp) {
4319 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4320 AArch64CC = changeIntCCToAArch64CC(CC, RHS);
4321 }
4322 AArch64cc = getCondCode(DAG, AArch64CC);
4323 return Cmp;
4324}
4325
4326static std::pair<SDValue, SDValue>
4327getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4328 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4329 "Unsupported value type");
4330 SDValue Value, Overflow;
4331 SDLoc DL(Op);
4332 SDValue LHS = Op.getOperand(0);
4333 SDValue RHS = Op.getOperand(1);
4334 unsigned Opc = 0;
4335 switch (Op.getOpcode()) {
4336 default:
4337 llvm_unreachable("Unknown overflow instruction!");
4338 case ISD::SADDO:
4339 Opc = AArch64ISD::ADDS;
4340 CC = AArch64CC::VS;
4341 break;
4342 case ISD::UADDO:
4343 Opc = AArch64ISD::ADDS;
4344 CC = AArch64CC::HS;
4345 break;
4346 case ISD::SSUBO:
4347 Opc = AArch64ISD::SUBS;
4348 CC = AArch64CC::VS;
4349 break;
4350 case ISD::USUBO:
4351 Opc = AArch64ISD::SUBS;
4352 CC = AArch64CC::LO;
4353 break;
4354 // Multiply needs a little bit of extra work.
4355 case ISD::SMULO:
4356 case ISD::UMULO: {
4357 CC = AArch64CC::NE;
4358 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4359 if (Op.getValueType() == MVT::i32) {
4360 // Extend to 64-bits, then perform a 64-bit multiply.
4361 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4362 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4363 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4364 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4365 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4366
4367 // Check that the result fits into a 32-bit integer.
4368 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4369 if (IsSigned) {
4370 // cmp xreg, wreg, sxtw
4371 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4372 Overflow =
4373 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4374 } else {
4375 // tst xreg, #0xffffffff00000000
4376 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4377 Overflow =
4378 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4379 }
4380 break;
4381 }
4382 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4383 // For the 64 bit multiply
4384 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4385 if (IsSigned) {
4386 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4387 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4388 DAG.getConstant(63, DL, MVT::i64));
4389 // It is important that LowerBits is last, otherwise the arithmetic
4390 // shift will not be folded into the compare (SUBS).
4391 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4392 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4393 .getValue(1);
4394 } else {
4395 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4396 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4397 Overflow =
4398 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4399 DAG.getConstant(0, DL, MVT::i64),
4400 UpperBits).getValue(1);
4401 }
4402 break;
4403 }
4404 } // switch (...)
4405
4406 if (Opc) {
4407 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4408
4409 // Emit the AArch64 operation with overflow check.
4410 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4411 Overflow = Value.getValue(1);
4412 }
4413 return std::make_pair(Value, Overflow);
4414}
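// Annotation (illustrative, not part of the upstream source): for
// llvm.smul.with.overflow.i32 the nodes built above are expected to select to
// roughly:
//   smull x8, w0, w1         (64-bit product of the 32-bit inputs)
//   cmp   x8, w8, sxtw       (does the product survive truncation to 32 bits?)
//   cset  w1, ne
// For the i64 case the high half comes from SMULH/UMULH and is compared
// against the low half shifted right by 63 (signed) or against zero (unsigned).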
4415
4416SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4417 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4418 !Subtarget->isNeonAvailable()))
4419 return LowerToScalableOp(Op, DAG);
4420
4421 SDValue Sel = Op.getOperand(0);
4422 SDValue Other = Op.getOperand(1);
4423 SDLoc DL(Sel);
4424
4425 // If the operand is an overflow checking operation, invert the condition
4426 // code and kill the Not operation. I.e., transform:
4427 // (xor (overflow_op_bool, 1))
4428 // -->
4429 // (csel 1, 0, invert(cc), overflow_op_bool)
4430 // ... which later gets transformed to just a cset instruction with an
4431 // inverted condition code, rather than a cset + eor sequence.
4432 if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
4433 // Only lower legal XALUO ops.
4434 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4435 return SDValue();
4436
4437 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4438 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4439 AArch64CC::CondCode CC;
4440 SDValue Value, Overflow;
4441 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4442 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4443 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4444 CCVal, Overflow);
4445 }
4446 // If neither operand is a SELECT_CC, give up.
4447 if (Sel.getOpcode() != ISD::SELECT_CC)
4448 std::swap(Sel, Other);
4449 if (Sel.getOpcode() != ISD::SELECT_CC)
4450 return Op;
4451
4452 // The folding we want to perform is:
4453 // (xor x, (select_cc a, b, cc, 0, -1) )
4454 // -->
4455 // (csel x, (xor x, -1), cc ...)
4456 //
4457 // The latter will get matched to a CSINV instruction.
4458
4459 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4460 SDValue LHS = Sel.getOperand(0);
4461 SDValue RHS = Sel.getOperand(1);
4462 SDValue TVal = Sel.getOperand(2);
4463 SDValue FVal = Sel.getOperand(3);
4464
4465 // FIXME: This could be generalized to non-integer comparisons.
4466 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4467 return Op;
4468
4469 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4470 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4471
4472 // The values aren't constants, this isn't the pattern we're looking for.
4473 if (!CFVal || !CTVal)
4474 return Op;
4475
4476 // We can commute the SELECT_CC by inverting the condition. This
4477 // might be needed to make this fit into a CSINV pattern.
4478 if (CTVal->isAllOnes() && CFVal->isZero()) {
4479 std::swap(TVal, FVal);
4480 std::swap(CTVal, CFVal);
4481 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4482 }
4483
4484 // If the constants line up, perform the transform!
4485 if (CTVal->isZero() && CFVal->isAllOnes()) {
4486 SDValue CCVal;
4487 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4488
4489 FVal = Other;
4490 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4491 DAG.getAllOnesConstant(DL, Other.getValueType()));
4492
4493 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4494 CCVal, Cmp);
4495 }
4496
4497 return Op;
4498}
4499
4500// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4501// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4502// sets 'C' bit to 0.
4503static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4504 SDLoc DL(Value);
4505 EVT VT = Value.getValueType();
4506 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4507 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4508 SDValue Cmp =
4509 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4510 return Cmp.getValue(1);
4511}
4512
4513// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4514// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4515static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4516 bool Invert) {
4517 assert(Glue.getResNo() == 1);
4518 SDLoc DL(Glue);
4519 SDValue Zero = DAG.getConstant(0, DL, VT);
4520 SDValue One = DAG.getConstant(1, DL, VT);
4521 AArch64CC::CondCode Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4522 SDValue CC = getCondCode(DAG, Cond);
4523 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4524}
4525
4526// Value is 1 if 'V' bit of NZCV is 1, else 0
4527static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4528 assert(Glue.getResNo() == 1);
4529 SDLoc DL(Glue);
4530 SDValue Zero = DAG.getConstant(0, DL, VT);
4531 SDValue One = DAG.getConstant(1, DL, VT);
4532 SDValue CC = getCondCode(DAG, AArch64CC::VS);
4533 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4534}
4535
4536// This lowering is inefficient, but it will get cleaned up by
4537// `foldOverflowCheck`
4538static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4539 unsigned Opcode, bool IsSigned) {
4540 EVT VT0 = Op.getValue(0).getValueType();
4541 EVT VT1 = Op.getValue(1).getValueType();
4542
4543 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4544 return SDValue();
4545
4546 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4547 SDValue OpLHS = Op.getOperand(0);
4548 SDValue OpRHS = Op.getOperand(1);
4549 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4550
4551 SDLoc DL(Op);
4552
4553 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4554 OpRHS, OpCarryIn);
4555
4556 SDValue OutFlag =
4557 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4558 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4559
4560 return DAG.getMergeValues({Sum, OutFlag}, DL);
4561}
4562
4563static SDValue lowerIntNeonIntrinsic(SDValue Op, unsigned Opcode,
4564 SelectionDAG &DAG) {
4565 SDLoc DL(Op);
4566 auto getFloatVT = [](EVT VT) {
4567 assert((VT == MVT::i32 || VT == MVT::i64) && "Unexpected VT");
4568 return VT == MVT::i32 ? MVT::f32 : MVT::f64;
4569 };
4570 auto bitcastToFloat = [&](SDValue Val) {
4571 return DAG.getBitcast(getFloatVT(Val.getValueType()), Val);
4572 };
4573 SmallVector<SDValue> NewOps;
4574 NewOps.reserve(Op.getNumOperands() - 1);
4575
4576 for (unsigned I = 1, E = Op.getNumOperands(); I < E; ++I)
4577 NewOps.push_back(bitcastToFloat(Op.getOperand(I)));
4578 EVT OrigVT = Op.getValueType();
4579 SDValue OpNode = DAG.getNode(Opcode, DL, getFloatVT(OrigVT), NewOps);
4580 return DAG.getBitcast(OrigVT, OpNode);
4581}
4582
4583static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4584 // Let legalize expand this if it isn't a legal type yet.
4585 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4586 return SDValue();
4587
4588 SDLoc DL(Op);
4589 AArch64CC::CondCode CC;
4590 // The actual operation that sets the overflow or carry flag.
4591 SDValue Value, Overflow;
4592 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4593
4594 // We use 0 and 1 as false and true values.
4595 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4596 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4597
4598 // We use an inverted condition, because the conditional select is inverted
4599 // too. This will allow it to be selected to a single instruction:
4600 // CSINC Wd, WZR, WZR, invert(cond).
4601 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4602 Overflow =
4603 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4604
4605 return DAG.getMergeValues({Value, Overflow}, DL);
4606}
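// Annotation (illustrative, not part of the upstream source): once the
// inverted CSEL above is folded, llvm.sadd.with.overflow.i32 typically becomes
//   adds w0, w0, w1
//   cset w1, vs              (signed overflow = V flag)
// while llvm.uadd.with.overflow.i32 tests the carry instead ("cset w1, hs").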
4607
4608// Prefetch operands are:
4609// 1: Address to prefetch
4610// 2: bool isWrite
4611// 3: int locality (0 = no locality ... 3 = extreme locality)
4612// 4: bool isDataCache
4613static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4614 SDLoc DL(Op);
4615 unsigned IsWrite = Op.getConstantOperandVal(2);
4616 unsigned Locality = Op.getConstantOperandVal(3);
4617 unsigned IsData = Op.getConstantOperandVal(4);
4618
4619 bool IsStream = !Locality;
4620 // When the locality number is set
4621 if (Locality) {
4622 // The front-end should have filtered out the out-of-range values
4623 assert(Locality <= 3 && "Prefetch locality out-of-range");
4624 // The locality degree is the opposite of the cache speed.
4625 // Put the number the other way around.
4626 // The encoding starts at 0 for level 1
4627 Locality = 3 - Locality;
4628 }
4629
4630 // Build the mask value encoding the expected behavior.
4631 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4632 (!IsData << 3) | // IsDataCache bit
4633 (Locality << 1) | // Cache level bits
4634 (unsigned)IsStream; // Stream bit
4635 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4636 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4637 Op.getOperand(1));
4638}
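// Annotation (illustrative, not part of the upstream source): the 5-bit prfop
// value built above is laid out as <type:2><target-level:2><policy:1>. A
// standalone sketch of the same encoding; the helper name is invented here:
static unsigned encodePrfOpSketch(bool IsWrite, unsigned Locality, bool IsData) {
  bool IsStream = Locality == 0;                // locality 0 -> non-temporal (*STRM)
  unsigned Level = Locality ? 3 - Locality : 0; // 3 -> L1, 2 -> L2, 1 -> L3
  return (IsWrite << 4) | (!IsData << 3) | (Level << 1) | IsStream;
}
// encodePrfOpSketch(false, 3, true) == 0  ("prfm pldl1keep, [Xn]")
// encodePrfOpSketch(true, 1, true)  == 20 ("prfm pstl3keep, [Xn]")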
4639
4640// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
4641// is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
4642// SUBS (AND X Y) Z, which combines better with emitComparison().
4644 SelectionDAG &DAG, const SDLoc DL) {
4645 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4646 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4647 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4648 if (LHSConstOp && RHSConst) {
4649 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4650 uint64_t RHSConstant = RHSConst->getZExtValue();
4651 if (isPowerOf2_64(RHSConstant)) {
4652 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4653 LHS =
4654 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4655 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4656 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4657 CC = ISD::SETEQ;
4658 }
4659 }
4660 }
4661}
4662
4663SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4664 SelectionDAG &DAG) const {
4665 EVT VT = Op.getValueType();
4666 if (VT.isScalableVector()) {
4667 SDValue SrcVal = Op.getOperand(0);
4668
4669 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4670 // Break conversion in two with the first part converting to f32 and the
4671 // second using native f32->VT instructions.
4672 SDLoc DL(Op);
4673 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4674 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4675 }
4676
4677 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4678 }
4679
4680 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4681 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4682
4683 bool IsStrict = Op->isStrictFPOpcode();
4684 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4685 EVT Op0VT = Op0.getValueType();
4686 if (VT == MVT::f64) {
4687 // f32->f64 and f16->f64 extends are legal.
4688 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4689 return Op;
4690 // Split bf16->f64 extends into two fpextends.
4691 if (Op0VT == MVT::bf16 && IsStrict) {
4692 SDValue Ext1 =
4693 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4694 {Op0, Op.getOperand(0)});
4695 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4696 {Ext1, Ext1.getValue(1)});
4697 }
4698 if (Op0VT == MVT::bf16)
4699 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4700 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4701 return SDValue();
4702 }
4703
4704 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4705 return SDValue();
4706}
4707
4708SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4709 SelectionDAG &DAG) const {
4710 EVT VT = Op.getValueType();
4711 bool IsStrict = Op->isStrictFPOpcode();
4712 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4713 EVT SrcVT = SrcVal.getValueType();
4714 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4715
4716 if (VT.isScalableVector()) {
4717 // Let common code split the operation.
4718 if (SrcVT == MVT::nxv8f32)
4719 return Op;
4720
4721 if (VT.getScalarType() != MVT::bf16)
4722 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4723
4724 SDLoc DL(Op);
4725 constexpr EVT I32 = MVT::nxv4i32;
4726 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4727
4728 SDValue NaN;
4729 SDValue Narrow;
4730
4731 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4732 if (Subtarget->hasBF16())
4733 return LowerToPredicatedOp(Op, DAG,
4734 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4735
4736 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4737
4738 // Set the quiet bit.
4739 if (!DAG.isKnownNeverSNaN(SrcVal))
4740 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4741 } else if (SrcVT == MVT::nxv2f64 &&
4742 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4743 // Round to float without introducing rounding errors and try again.
4744 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4745 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4746 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4747
4749 if (IsStrict)
4750 NewOps.push_back(Op.getOperand(0));
4751 NewOps.push_back(Narrow);
4752 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4753 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4754 } else
4755 return SDValue();
4756
4757 if (!Trunc) {
4758 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4759 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4760 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4761 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4762 }
4763
4764 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4765 // 0x80000000.
4766 if (NaN) {
4767 EVT I1 = I32.changeElementType(MVT::i1);
4768 EVT CondVT = VT.changeElementType(MVT::i1);
4769 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4770 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4771 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4772 }
4773
4774 // Now that we have rounded, shift the bits into position.
4775 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4776 return getSVESafeBitCast(VT, Narrow, DAG);
4777 }
4778
4779 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4780 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4781
4782 // Expand cases where the result type is BF16 but we don't have hardware
4783 // instructions to lower it.
4784 if (VT.getScalarType() == MVT::bf16 &&
4785 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4786 Subtarget->hasBF16())) {
4787 SDLoc DL(Op);
4788 SDValue Narrow = SrcVal;
4789 SDValue NaN;
4790 EVT I32 = SrcVT.changeElementType(MVT::i32);
4791 EVT F32 = SrcVT.changeElementType(MVT::f32);
4792 if (SrcVT.getScalarType() == MVT::f32) {
4793 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4794 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4795 if (!NeverSNaN) {
4796 // Set the quiet bit.
4797 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4798 DAG.getConstant(0x400000, DL, I32));
4799 }
4800 } else if (SrcVT.getScalarType() == MVT::f64) {
4801 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4802 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4803 } else {
4804 return SDValue();
4805 }
4806 if (!Trunc) {
4807 SDValue One = DAG.getConstant(1, DL, I32);
4808 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4809 DAG.getShiftAmountConstant(16, I32, DL));
4810 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4811 SDValue RoundingBias =
4812 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4813 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4814 }
4815
4816 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4817 // 0x80000000.
4818 if (NaN) {
4819 SDValue IsNaN = DAG.getSetCC(
4820 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4821 SrcVal, SrcVal, ISD::SETUO);
4822 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4823 }
4824
4825 // Now that we have rounded, shift the bits into position.
4826 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4827 DAG.getShiftAmountConstant(16, I32, DL));
4828 if (VT.isVector()) {
4829 EVT I16 = I32.changeVectorElementType(MVT::i16);
4830 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4831 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4832 }
4833 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4834 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4835 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4836 : Result;
4837 }
4838
4839 if (SrcVT != MVT::f128) {
4840 // Expand cases where the input is a vector bigger than NEON.
4841 if (useSVEForFixedLengthVectorVT(SrcVT))
4842 return SDValue();
4843
4844 // It's legal except when f128 is involved
4845 return Op;
4846 }
4847
4848 return SDValue();
4849}
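// Annotation (illustrative, not part of the upstream source): a scalar C++
// sketch of the bf16 expansion above, assuming round-to-nearest-even. The
// helper name is invented here; it needs <cstdint> and <cstring>.
static uint16_t roundF32ToBF16Sketch(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  if ((Bits & 0x7FFFFFFFu) > 0x7F800000u)  // NaN: set the quiet bit, don't round.
    return uint16_t((Bits | 0x00400000u) >> 16);
  uint32_t Lsb = (Bits >> 16) & 1;         // low bit of the kept half (ties-to-even)
  Bits += 0x7FFFu + Lsb;                   // rounding bias, as in the DAG above
  return uint16_t(Bits >> 16);
}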
4850
4851SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4852 SelectionDAG &DAG) const {
4853 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4854 // Any additional optimization in this function should be recorded
4855 // in the cost tables.
4856 bool IsStrict = Op->isStrictFPOpcode();
4857 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4858 EVT VT = Op.getValueType();
4859
4860 assert(!(IsStrict && VT.isScalableVector()) &&
4861 "Unimplemented SVE support for STRICT_FP_to_INT!");
4862
4863 // f16 conversions are promoted to f32 when full fp16 is not supported.
4864 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4865 InVT.getVectorElementType() == MVT::bf16) {
4866 EVT NewVT = VT.changeElementType(MVT::f32);
4867 SDLoc DL(Op);
4868 if (IsStrict) {
4869 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4870 {Op.getOperand(0), Op.getOperand(1)});
4871 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4872 {Ext.getValue(1), Ext.getValue(0)});
4873 }
4874 return DAG.getNode(
4875 Op.getOpcode(), DL, Op.getValueType(),
4876 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4877 }
4878
4879 if (VT.isScalableVector()) {
4880 if (VT.getVectorElementType() == MVT::i1) {
4881 SDLoc DL(Op);
4882 EVT CvtVT = getPromotedVTForPredicate(VT);
4883 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4884 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4885 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4886 }
4887
4888 // Let common code split the operation.
4889 if (InVT == MVT::nxv8f32)
4890 return Op;
4891
4892 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4893 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4894 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4895 return LowerToPredicatedOp(Op, DAG, Opcode);
4896 }
4897
4898 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4899 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4900 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4901
4902 uint64_t VTSize = VT.getFixedSizeInBits();
4903 uint64_t InVTSize = InVT.getFixedSizeInBits();
4904 if (VTSize < InVTSize) {
4905 SDLoc DL(Op);
4906 if (IsStrict) {
4907 InVT = InVT.changeVectorElementTypeToInteger();
4908 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4909 {Op.getOperand(0), Op.getOperand(1)});
4910 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4911 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4912 }
4913 SDValue Cv =
4914 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4915 Op.getOperand(0));
4916 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4917 }
4918
4919 if (VTSize > InVTSize) {
4920 SDLoc DL(Op);
4921 MVT ExtVT =
4922 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4923 VT.getVectorNumElements());
4924 if (IsStrict) {
4925 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4926 {Op.getOperand(0), Op.getOperand(1)});
4927 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4928 {Ext.getValue(1), Ext.getValue(0)});
4929 }
4930 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4931 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4932 }
4933
4934 // Use a scalar operation for conversions between single-element vectors of
4935 // the same size.
4936 if (InVT.getVectorNumElements() == 1) {
4937 SDLoc DL(Op);
4938 SDValue Extract = DAG.getNode(
4939 ISD::EXTRACT_VECTOR_ELT, DL, InVT.getScalarType(),
4940 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
4941 EVT ScalarVT = VT.getScalarType();
4942 if (IsStrict)
4943 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
4944 {Op.getOperand(0), Extract});
4945 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
4946 }
4947
4948 // Type changing conversions are illegal.
4949 return Op;
4950}
4951
4952SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4953 SelectionDAG &DAG) const {
4954 bool IsStrict = Op->isStrictFPOpcode();
4955 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4956
4957 if (SrcVal.getValueType().isVector())
4958 return LowerVectorFP_TO_INT(Op, DAG);
4959
4960 // f16 conversions are promoted to f32 when full fp16 is not supported.
4961 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4962 SrcVal.getValueType() == MVT::bf16) {
4963 SDLoc DL(Op);
4964 if (IsStrict) {
4965 SDValue Ext =
4966 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
4967 {Op.getOperand(0), SrcVal});
4968 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
4969 {Ext.getValue(1), Ext.getValue(0)});
4970 }
4971 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
4972 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
4973 }
4974
4975 if (SrcVal.getValueType() != MVT::f128) {
4976 // It's legal except when f128 is involved
4977 return Op;
4978 }
4979
4980 return SDValue();
4981}
4982
4983SDValue
4984AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4985 SelectionDAG &DAG) const {
4986 // AArch64 FP-to-int conversions saturate to the destination element size, so
4987 // we can lower common saturating conversions to simple instructions.
4988 SDValue SrcVal = Op.getOperand(0);
4989 EVT SrcVT = SrcVal.getValueType();
4990 EVT DstVT = Op.getValueType();
4991 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4992
4993 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4994 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4995 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4996 assert(SatWidth <= DstElementWidth &&
4997 "Saturation width cannot exceed result width");
4998
4999 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
5000 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
5001 // types, so this is hard to reach.
5002 if (DstVT.isScalableVector())
5003 return SDValue();
5004
5005 EVT SrcElementVT = SrcVT.getVectorElementType();
5006
5007 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5008 SDLoc DL(Op);
5009 SDValue SrcVal2;
5010 if ((SrcElementVT == MVT::f16 &&
5011 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
5012 SrcElementVT == MVT::bf16) {
5013 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
5014 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
5015 // If we are extending to a v8f32, split into two v4f32 to produce legal
5016 // types.
5017 if (F32VT.getSizeInBits() > 128) {
5018 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
5019 F32VT = F32VT.getHalfNumVectorElementsVT();
5020 }
5021 SrcVT = F32VT;
5022 SrcElementVT = MVT::f32;
5023 SrcElementWidth = 32;
5024 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
5025 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
5026 return SDValue();
5027
5028 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
5029 // width and produce a fcvtzu.
5030 if (SatWidth == 64 && SrcElementWidth < 64) {
5031 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
5032 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
5033 SrcVT = F64VT;
5034 SrcElementVT = MVT::f64;
5035 SrcElementWidth = 64;
5036 }
5037 // Cases that we can emit directly.
5038 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
5039 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5040 DAG.getValueType(DstVT.getScalarType()));
5041 if (SrcVal2) {
5042 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
5043 DAG.getValueType(DstVT.getScalarType()));
5044 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
5045 }
5046 return Res;
5047 }
5048
5049 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5050 // result. This is only valid if the legal cvt is larger than the saturate
5051 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
5052 // (at least until sqxtn is selected).
5053 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
5054 return SDValue();
5055
5056 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
5057 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
5058 DAG.getValueType(IntVT.getScalarType()));
5059 SDValue NativeCvt2 =
5060 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
5061 DAG.getValueType(IntVT.getScalarType()))
5062 : SDValue();
5063 SDValue Sat, Sat2;
5064 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5065 SDValue MinC = DAG.getConstant(
5066 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
5067 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
5068 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
5069 SDValue MaxC = DAG.getConstant(
5070 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
5071 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
5072 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
5073 } else {
5074 SDValue MinC = DAG.getConstant(
5075 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
5076 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
5077 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
5078 }
5079
5080 if (SrcVal2)
5081 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
5083 Sat, Sat2);
5084
5085 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5086}
5087
5088SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
5089 SelectionDAG &DAG) const {
5090 // AArch64 FP-to-int conversions saturate to the destination register size, so
5091 // we can lower common saturating conversions to simple instructions.
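// For example (illustrative), @llvm.fptosi.sat.i32.f32 can be a single
// "fcvtzs w0, s0": the instruction already clamps out-of-range inputs to
// INT32_MIN/INT32_MAX and converts NaN to 0, which matches the intrinsic's
// semantics.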
5092 SDValue SrcVal = Op.getOperand(0);
5093 EVT SrcVT = SrcVal.getValueType();
5094
5095 if (SrcVT.isVector())
5096 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5097
5098 EVT DstVT = Op.getValueType();
5099 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5100 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5101 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5102 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5103
5104 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5105 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5106 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5107 SrcVT = MVT::f32;
5108 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5109 SrcVT != MVT::bf16)
5110 return SDValue();
5111
5112 SDLoc DL(Op);
5113 // Cases that we can emit directly.
5114 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5115 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5116 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5117 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5118 DAG.getValueType(DstVT));
5119
5120 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5121 // result. This is only valid if the legal cvt is larger than the saturate
5122 // width.
5123 if (DstWidth < SatWidth)
5124 return SDValue();
5125
5126 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5127 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5128 SDValue CVTf32 =
5129 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5130 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5131 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5132 DAG.getValueType(SatVT));
5133 }
5134 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5135 return DAG.getBitcast(DstVT, CVTf32);
5136 }
5137
5138 SDValue NativeCvt =
5139 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5140 SDValue Sat;
5141 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5142 SDValue MinC = DAG.getConstant(
5143 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5144 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5145 SDValue MaxC = DAG.getConstant(
5146 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5147 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5148 } else {
5149 SDValue MinC = DAG.getConstant(
5150 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5151 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5152 }
5153
5154 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5155}
5156
5157SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
5158 SelectionDAG &DAG) const {
5159 EVT VT = Op.getValueType();
5160 SDValue Src = Op.getOperand(0);
5161 SDLoc DL(Op);
5162
5163 assert(VT.isVector() && "Expected vector type");
5164
5165 EVT CastVT =
5166 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
5167
5168 // Round the floating-point value into a floating-point register with the
5169 // current rounding mode.
5170 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
5171
5172 // Truncate the rounded floating point to an integer.
5173 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5175}
5176
5177SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5178 SelectionDAG &DAG) const {
5179 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5180 // Any additional optimization in this function should be recorded
5181 // in the cost tables.
5182 bool IsStrict = Op->isStrictFPOpcode();
5183 EVT VT = Op.getValueType();
5184 SDLoc DL(Op);
5185 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5186 EVT InVT = In.getValueType();
5187 unsigned Opc = Op.getOpcode();
5188 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5189
5190 assert(!(IsStrict && VT.isScalableVector()) &&
5191 "Unimplemented SVE support for ISD::STRICT_INT_TO_FP!");
5192
5193 // NOTE: i1->bf16 does not require promotion to f32.
5194 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5195 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5196 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5197 : DAG.getConstantFP(1.0, DL, VT);
5198 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5199 }
5200
5201 // Promote bf16 conversions to f32.
5202 if (VT.getVectorElementType() == MVT::bf16) {
5203 EVT F32 = VT.changeElementType(MVT::f32);
5204 if (IsStrict) {
5205 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5206 {Op.getOperand(0), In});
5207 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5208 {Op.getValueType(), MVT::Other},
5209 {Val.getValue(1), Val.getValue(0),
5210 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5211 }
5212 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5213 DAG.getNode(Op.getOpcode(), DL, F32, In),
5214 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5215 }
5216
5217 if (VT.isScalableVector()) {
5218 // Let common code split the operation.
5219 if (VT == MVT::nxv8f32)
5220 return Op;
5221
5222 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5223 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5224 return LowerToPredicatedOp(Op, DAG, Opcode);
5225 }
5226
5227 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5228 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5229 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5230
5231 uint64_t VTSize = VT.getFixedSizeInBits();
5232 uint64_t InVTSize = InVT.getFixedSizeInBits();
5233 if (VTSize < InVTSize) {
5234 // AArch64 doesn't have a direct vector instruction that converts an
5235 // integer to floating point AND narrows it at the same time. The additional
5236 // rounding when the target is f32/f64 causes double-rounding issues.
5237 // Conversion to f16 is fine due to its narrow width.
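    // Worked example of the hazard (illustrative, not from the original
    // source): for the i64 value 2^53 + 2^29 + 1, rounding to f64 first gives
    // 2^53 + 2^29 (ties-to-even), and rounding that to f32 gives 2^53, while a
    // single correctly rounded i64 -> f32 conversion gives 2^53 + 2^30.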
5238 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5239 bool IsTargetf16 = false;
5240 if (Op.hasOneUse() &&
5241 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5242 // Some vector types are split in half during legalization, then
5243 // concatenated and rounded back to the original vector type. If we end up
5244 // resolving to an f16 type, we don't need to worry about rounding errors.
5245 SDNode *U = *Op->user_begin();
5246 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5247 EVT TmpVT = U->user_begin()->getValueType(0);
5248 if (TmpVT.getScalarType() == MVT::f16)
5249 IsTargetf16 = true;
5250 }
5251 }
5252
5253 if (IsTargetf32 && !IsTargetf16) {
5254 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5255 }
5256
5257 MVT CastVT =
5258 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
5259 InVT.getVectorNumElements());
5260 if (IsStrict) {
5261 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5262 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5263 {In.getValue(1), In.getValue(0),
5264 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5265 }
5266 In = DAG.getNode(Opc, DL, CastVT, In);
5267 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5268 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5269 }
5270
5271 if (VTSize > InVTSize) {
5272 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5273 EVT CastVT = VT.changeVectorElementTypeToInteger();
5274 In = DAG.getNode(CastOpc, DL, CastVT, In);
5275 if (IsStrict)
5276 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5277 return DAG.getNode(Opc, DL, VT, In);
5278 }
5279
5280 // Use a scalar operation for conversions between single-element vectors of
5281 // the same size.
5282 if (VT.getVectorNumElements() == 1) {
5283 SDValue Extract =
5284 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InVT.getScalarType(), In,
5285 DAG.getConstant(0, DL, MVT::i64));
5286 EVT ScalarVT = VT.getScalarType();
5287 if (IsStrict)
5288 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5289 {Op.getOperand(0), Extract});
5290 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5291 }
5292
5293 return Op;
5294}
5295
5296SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5297 SelectionDAG &DAG) const {
5298 if (Op.getValueType().isVector())
5299 return LowerVectorINT_TO_FP(Op, DAG);
5300
5301 bool IsStrict = Op->isStrictFPOpcode();
5302 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5303
5304 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5305 Op->getOpcode() == ISD::SINT_TO_FP;
5306
5307 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5308 SDLoc DL(Op);
5309 if (IsStrict) {
5310 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5311 {Op.getOperand(0), SrcVal});
5312 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5313 {Op.getValueType(), MVT::Other},
5314 {Val.getValue(1), Val.getValue(0),
5315 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5316 }
5317 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5318 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5319 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5320 };
5321
5322 if (Op.getValueType() == MVT::bf16) {
5323 unsigned MaxWidth = IsSigned
5324 ? DAG.ComputeMaxSignificantBits(SrcVal)
5325 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5326 // bf16 conversions are promoted to f32 when converting from i16.
5327 if (MaxWidth <= 24) {
5328 return IntToFpViaPromotion(MVT::f32);
5329 }
5330
5331 // bf16 conversions are promoted to f64 when converting from i32.
5332 if (MaxWidth <= 53) {
5333 return IntToFpViaPromotion(MVT::f64);
5334 }
5335
5336 // We need to be careful about i64 -> bf16.
5337 // Consider the value 22216703.
5338 // This number cannot be represented exactly as an f32, so an itofp will
5339 // turn it into 22216704.0; an fptrunc to bf16 then turns this into 22282240.0.
5340 // However, the correctly rounded bf16 result is 22151168.0.
5341 // We need to use sticky rounding to get this correct.
5342 if (SrcVal.getValueType() == MVT::i64) {
5343 SDLoc DL(Op);
5344 // This algorithm is equivalent to the following:
5345 // uint64_t SrcHi = SrcVal & ~0xfffull;
5346 // uint64_t SrcLo = SrcVal & 0xfffull;
5347 // uint64_t Highest = SrcVal >> 53;
5348 // bool HasHighest = Highest != 0;
5349 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5350 // double Rounded = static_cast<double>(ToRound);
5351 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5352 // uint64_t HasLo = SrcLo != 0;
5353 // bool NeedsAdjustment = HasHighest & HasLo;
5354 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5355 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5356 // return static_cast<__bf16>(Adjusted);
5357 //
5358 // Essentially, what happens is that SrcVal either fits perfectly in a
5359 // double-precision value or it is too big. If it is sufficiently small,
5360 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5361 // ensure that u64 -> double has no rounding error by only using the 52
5362 // MSB of the input. The low order bits will get merged into a sticky bit
5363 // which will avoid issues incurred by double rounding.
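      // Worked example (illustrative, not from the original source): for a
      // uitofp of SrcVal = 0x8080000000000001 (2^63 + 2^55 + 1), the naive
      // u64 -> double -> bf16 route rounds to 2^63 + 2^55 in double and then
      // to 2^63 in bf16 (ties-to-even), whereas the correctly rounded result
      // is 2^63 + 2^56. Here ToRound = SrcHi = 2^63 + 2^55 converts to double
      // exactly, SrcLo != 0 sets the sticky bit, and the final rounding then
      // correctly goes up to 2^63 + 2^56.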
5364
5365 // Signed conversion is more or less like so:
5366 // copysign((__bf16)abs(SrcVal), SrcVal)
5367 SDValue SignBit;
5368 if (IsSigned) {
5369 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5370 DAG.getConstant(1ull << 63, DL, MVT::i64));
5371 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5372 }
5373 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5374 DAG.getConstant(~0xfffull, DL, MVT::i64));
5375 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5376 DAG.getConstant(0xfffull, DL, MVT::i64));
5377 SDValue Highest =
5378 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5379 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5380 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5381 SDValue ToRound =
5382 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5383 SDValue Rounded =
5384 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5385 {Op.getOperand(0), ToRound})
5386 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5387
5388 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5389 if (SignBit) {
5390 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5391 }
5392
5393 SDValue HasHighest = DAG.getSetCC(
5394 DL,
5395 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5396 Highest, Zero64, ISD::SETNE);
5397
5398 SDValue HasLo = DAG.getSetCC(
5399 DL,
5400 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5401 SrcLo, Zero64, ISD::SETNE);
5402
5403 SDValue NeedsAdjustment =
5404 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5405 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5406
5407 SDValue AdjustedBits =
5408 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5409 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5410 return IsStrict
5411 ? DAG.getNode(
5412 ISD::STRICT_FP_ROUND, DL,
5413 {Op.getValueType(), MVT::Other},
5414 {Rounded.getValue(1), Adjusted,
5415 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5416 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5417 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5418 }
5419 }
5420
5421 // f16 conversions are promoted to f32 when full fp16 is not supported.
5422 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5423 return IntToFpViaPromotion(MVT::f32);
5424 }
5425
5426 // i128 conversions are libcalls.
5427 if (SrcVal.getValueType() == MVT::i128)
5428 return SDValue();
5429
5430 // Other conversions are legal, unless it's to the completely software-based
5431 // fp128.
5432 if (Op.getValueType() != MVT::f128)
5433 return Op;
5434 return SDValue();
5435}
5436
5437static MVT getSVEContainerType(EVT ContentTy);
5438
5439SDValue
5440AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5441 SelectionDAG &DAG) const {
5442 SDLoc DL(Op);
5443 EVT VT = Op.getValueType();
5444 SDValue EltSize = Op.getOperand(2);
5445 switch (EltSize->getAsZExtVal()) {
5446 case 1:
5447 if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
5448 return SDValue();
5449 break;
5450 case 2:
5451 if (VT != MVT::v8i8 && VT != MVT::nxv8i1)
5452 return SDValue();
5453 break;
5454 case 4:
5455 if (VT != MVT::v4i16 && VT != MVT::nxv4i1)
5456 return SDValue();
5457 break;
5458 case 8:
5459 if (VT != MVT::v2i32 && VT != MVT::nxv2i1)
5460 return SDValue();
5461 break;
5462 default:
5463 // Other element sizes are incompatible with whilewr/rw, so expand instead
5464 return SDValue();
5465 }
5466
5467 SDValue LaneOffset = Op.getOperand(3);
5468 if (LaneOffset->getAsZExtVal())
5469 return SDValue();
5470
5471 SDValue PtrA = Op.getOperand(0);
5472 SDValue PtrB = Op.getOperand(1);
5473
5474 if (VT.isScalableVT())
5475 return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, EltSize, LaneOffset);
5476
5477 // We can use the SVE whilewr/whilerw instruction to lower this
5478 // intrinsic by creating the appropriate sequence of scalable vector
5479 // operations and then extracting a fixed-width subvector from the scalable
5480 // vector. Scalable vector variants are already legal.
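  // Illustrative sketch (an assumption about the final code, not from the
  // original source): for VT = v16i8 and element size 1 this builds the
  // equivalent of
  //   whilewr p0.b, x0, x1        // nxv16i1 dependence mask
  //   mov     z0.b, p0/z, #-1     // sign-extend the predicate to nxv16i8
  // with the low 128 bits of z0 used as the v16i8 result.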
5481 EVT ContainerVT =
5482 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
5483 VT.getVectorNumElements(), true);
5484 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5485
5486 SDValue Mask =
5487 DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, EltSize, LaneOffset);
5488 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
5489 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
5490 DAG.getVectorIdxConstant(0, DL));
5491}
5492
5493SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5494 SelectionDAG &DAG) const {
5495 EVT OpVT = Op.getValueType();
5496 EVT ArgVT = Op.getOperand(0).getValueType();
5497
5498 if (useSVEForFixedLengthVectorVT(OpVT, !Subtarget->isNeonAvailable()))
5499 return LowerFixedLengthBitcastToSVE(Op, DAG);
5500
5501 if (OpVT.isScalableVector()) {
5502 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5503
5504 // Handle type legalisation first.
5505 if (!isTypeLegal(ArgVT)) {
5506 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5507 "Expected int->fp bitcast!");
5508
5509 // Bitcasting between unpacked vector types of different element counts is
5510 // not a NOP because the live elements are laid out differently.
5511 // 01234567
5512 // e.g. nxv2i32 = XX??XX??
5513 // nxv4f16 = X?X?X?X?
5514 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5515 return SDValue();
5516
5517 SDValue ExtResult =
5518 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5519 Op.getOperand(0));
5520 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5521 }
5522
5523 // Bitcasts between legal types with the same element count are legal.
5524 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5525 return Op;
5526
5527 // getSVESafeBitCast does not support casting between unpacked types.
5528 if (!isPackedVectorType(OpVT, DAG))
5529 return SDValue();
5530
5531 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5532 }
5533
5534 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5535 return SDValue();
5536
5537 // Bitcasts between f16 and bf16 are legal.
5538 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5539 return Op;
5540
5541 assert(ArgVT == MVT::i16);
5542 SDLoc DL(Op);
5543
5544 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5545 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5546 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5547}
5548
5549// Returns lane if Op extracts from a two-element vector and lane is constant
5550// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5551static std::optional<uint64_t>
5552getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5553 SDNode *OpNode = Op.getNode();
5554 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5555 return std::nullopt;
5556
5557 EVT VT = OpNode->getOperand(0).getValueType();
5558 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5559 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5560 return std::nullopt;
5561
5562 return C->getZExtValue();
5563}
5564
5565static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5566 bool isSigned) {
5567 EVT VT = N.getValueType();
5568
5569 if (N.getOpcode() != ISD::BUILD_VECTOR)
5570 return false;
5571
5572 for (const SDValue &Elt : N->op_values()) {
5573 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5574 unsigned EltSize = VT.getScalarSizeInBits();
5575 unsigned HalfSize = EltSize / 2;
5576 if (isSigned) {
5577 if (!isIntN(HalfSize, C->getSExtValue()))
5578 return false;
5579 } else {
5580 if (!isUIntN(HalfSize, C->getZExtValue()))
5581 return false;
5582 }
5583 continue;
5584 }
5585 return false;
5586 }
5587
5588 return true;
5589}
5590
5591static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5592 EVT VT = N.getValueType();
5593 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5594 EVT HalfVT = EVT::getVectorVT(
5595 *DAG.getContext(),
5596 VT.getScalarType().getHalfSizedIntegerVT(*DAG.getContext()),
5597 VT.getVectorElementCount());
5598 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5599}
5600
5601static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5602 return N.getOpcode() == ISD::SIGN_EXTEND ||
5603 N.getOpcode() == ISD::ANY_EXTEND ||
5604 isExtendedBUILD_VECTOR(N, DAG, true);
5605}
5606
5607static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5608 return N.getOpcode() == ISD::ZERO_EXTEND ||
5609 N.getOpcode() == ISD::ANY_EXTEND ||
5610 isExtendedBUILD_VECTOR(N, DAG, false);
5611}
5612
5613static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5614 unsigned Opcode = N.getOpcode();
5615 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5616 SDValue N0 = N.getOperand(0);
5617 SDValue N1 = N.getOperand(1);
5618 return N0->hasOneUse() && N1->hasOneUse() &&
5619 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5620 }
5621 return false;
5622}
5623
5624static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5625 unsigned Opcode = N.getOpcode();
5626 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5627 SDValue N0 = N.getOperand(0);
5628 SDValue N1 = N.getOperand(1);
5629 return N0->hasOneUse() && N1->hasOneUse() &&
5630 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5631 }
5632 return false;
5633}
5634
5635SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5636 SelectionDAG &DAG) const {
5637 // The rounding mode is in bits 23:22 of the FPCR.
5638 // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5639 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
5640 // so that the shift and the AND get folded into a bitfield extract.
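  // Worked example (illustrative): with FPCR[23:22] = 0b11 (round toward
  // zero), ((FPCR + (1 << 22)) >> 22) & 3 == (3 + 1) & 3 == 0, the FLT_ROUNDS
  // value for toward-zero; FPCR[23:22] = 0b00 (to nearest) maps to 1.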
5641 SDLoc DL(Op);
5642
5643 SDValue Chain = Op.getOperand(0);
5644 SDValue FPCR_64 =
5645 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5646 {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL,
5647 MVT::i64)});
5648 Chain = FPCR_64.getValue(1);
5649 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5650 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5651 DAG.getConstant(1U << 22, DL, MVT::i32));
5652 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5653 DAG.getConstant(22, DL, MVT::i32));
5654 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5655 DAG.getConstant(3, DL, MVT::i32));
5656 return DAG.getMergeValues({AND, Chain}, DL);
5657}
5658
5659SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5660 SelectionDAG &DAG) const {
5661 SDLoc DL(Op);
5662 SDValue Chain = Op->getOperand(0);
5663 SDValue RMValue = Op->getOperand(1);
5664
5665 // The rounding mode is in bits 23:22 of the FPCR.
5666 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5667 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5668 // (((arg - 1) & 3) << 22).
5669 //
5670 // The argument of llvm.set.rounding must be within the range [0, 3], so
5671 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5672 // code that generates llvm.set.rounding to ensure this condition.
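  // Worked example (illustrative): llvm.set.rounding(0) (toward zero) computes
  // ((0 - 1) & 3) << 22 = 3 << 22, i.e. FPCR[23:22] = 0b11 (RZ), while
  // llvm.set.rounding(1) (to nearest) computes 0 << 22, i.e. 0b00 (RN).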
5673
5674 // Calculate new value of FPCR[23:22].
5675 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5676 DAG.getConstant(1, DL, MVT::i32));
5677 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5678 DAG.getConstant(0x3, DL, MVT::i32));
5679 RMValue =
5680 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5681 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5682 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5683
5684 // Get current value of FPCR.
5685 SDValue Ops[] = {
5686 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5687 SDValue FPCR =
5688 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5689 Chain = FPCR.getValue(1);
5690 FPCR = FPCR.getValue(0);
5691
5692 // Put the new rounding mode into FPCR[23:22].
5693 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5694 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5695 DAG.getConstant(RMMask, DL, MVT::i64));
5696 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5697 SDValue Ops2[] = {
5698 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5699 FPCR};
5700 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5701}
5702
5703SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5704 SelectionDAG &DAG) const {
5705 SDLoc DL(Op);
5706 SDValue Chain = Op->getOperand(0);
5707
5708 // Get current value of FPCR.
5709 SDValue Ops[] = {
5710 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5711 SDValue FPCR =
5712 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5713 Chain = FPCR.getValue(1);
5714 FPCR = FPCR.getValue(0);
5715
5716 // Truncate FPCR to 32 bits.
5717 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5718
5719 return DAG.getMergeValues({Result, Chain}, DL);
5720}
5721
5722SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5723 SelectionDAG &DAG) const {
5724 SDLoc DL(Op);
5725 SDValue Chain = Op->getOperand(0);
5726 SDValue Mode = Op->getOperand(1);
5727
5728 // Extend the specified value to 64 bits.
5729 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5730
5731 // Set new value of FPCR.
5732 SDValue Ops2[] = {
5733 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5734 FPCR};
5735 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5736}
5737
5738SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5739 SelectionDAG &DAG) const {
5740 SDLoc DL(Op);
5741 SDValue Chain = Op->getOperand(0);
5742
5743 // Get current value of FPCR.
5744 SDValue Ops[] = {
5745 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5746 SDValue FPCR =
5747 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5748 Chain = FPCR.getValue(1);
5749 FPCR = FPCR.getValue(0);
5750
5751 // Clear bits that are not reserved.
5752 SDValue FPSCRMasked = DAG.getNode(
5753 ISD::AND, DL, MVT::i64, FPCR,
5754 DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
5755
5756 // Set new value of FPCR.
5757 SDValue Ops2[] = {
5758 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5759 FPSCRMasked};
5760 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5761}
5762
5763static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5764 SDLoc DL, bool &IsMLA) {
5765 bool IsN0SExt = isSignExtended(N0, DAG);
5766 bool IsN1SExt = isSignExtended(N1, DAG);
5767 if (IsN0SExt && IsN1SExt)
5768 return AArch64ISD::SMULL;
5769
5770 bool IsN0ZExt = isZeroExtended(N0, DAG);
5771 bool IsN1ZExt = isZeroExtended(N1, DAG);
5772
5773 if (IsN0ZExt && IsN1ZExt)
5774 return AArch64ISD::UMULL;
5775
5776 // Select UMULL if we can replace the other operand with an extend.
5777 EVT VT = N0.getValueType();
5778 unsigned EltSize = VT.getScalarSizeInBits();
5779 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5780 if (IsN0ZExt || IsN1ZExt) {
5781 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5782 return AArch64ISD::UMULL;
5783 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5784 DAG.MaskedValueIsZero(N1, Mask)) {
5785 // For v2i64 we look more aggressively at both operands being zero, to avoid
5786 // scalarization.
5787 return AArch64ISD::UMULL;
5788 }
5789
5790 if (IsN0SExt || IsN1SExt) {
5791 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5792 return AArch64ISD::SMULL;
5793 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5794 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5795 return AArch64ISD::SMULL;
5796 }
5797
5798 if (!IsN1SExt && !IsN1ZExt)
5799 return 0;
5800
5801 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5802 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5803 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5804 IsMLA = true;
5805 return AArch64ISD::SMULL;
5806 }
5807 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5808 IsMLA = true;
5809 return AArch64ISD::UMULL;
5810 }
5811 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5812 std::swap(N0, N1);
5813 IsMLA = true;
5814 return AArch64ISD::UMULL;
5815 }
5816 return 0;
5817}
5818
5819SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5820 EVT VT = Op.getValueType();
5821
5822 bool OverrideNEON = !Subtarget->isNeonAvailable();
5823 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5824 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5825
5826 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5827 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
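  // Illustrative example (an assumption, not from the original source): a
  // v8i16 multiply whose operands are both sign-extended from v8i8 is intended
  // to become a single "smull v0.8h, v1.8b, v2.8b" rather than two extends
  // plus a v8i16 multiply.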
5828 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5829 "unexpected type for custom-lowering ISD::MUL");
5830 SDValue N0 = Op.getOperand(0);
5831 SDValue N1 = Op.getOperand(1);
5832 bool isMLA = false;
5833 EVT OVT = VT;
5834 if (VT.is64BitVector()) {
5835 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5836 isNullConstant(N0.getOperand(1)) &&
5837 N0.getOperand(0).getValueType().is128BitVector() &&
5838 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5839 isNullConstant(N1.getOperand(1)) &&
5840 N1.getOperand(0).getValueType().is128BitVector()) {
5841 N0 = N0.getOperand(0);
5842 N1 = N1.getOperand(0);
5843 VT = N0.getValueType();
5844 } else {
5845 if (VT == MVT::v1i64) {
5846 if (Subtarget->hasSVE())
5847 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5848 // Fall through to expand this. It is not legal.
5849 return SDValue();
5850 } else
5851 // Other vector multiplications are legal.
5852 return Op;
5853 }
5854 }
5855
5856 SDLoc DL(Op);
5857 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5858
5859 if (!NewOpc) {
5860 if (VT.getVectorElementType() == MVT::i64) {
5861 // If SVE is available then i64 vector multiplications can also be made
5862 // legal.
5863 if (Subtarget->hasSVE())
5864 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5865 // Fall through to expand this. It is not legal.
5866 return SDValue();
5867 } else
5868 // Other vector multiplications are legal.
5869 return Op;
5870 }
5871
5872 // Legalize to a S/UMULL instruction
5873 SDValue Op0;
5874 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5875 if (!isMLA) {
5876 Op0 = skipExtensionForVectorMULL(N0, DAG);
5877 assert(Op0.getValueType().is64BitVector() &&
5878 Op1.getValueType().is64BitVector() &&
5879 "unexpected types for extended operands to VMULL");
5880 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5881 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5882 DAG.getConstant(0, DL, MVT::i64));
5883 }
5884 // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
5885 // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
5886 // This holds for CPUs with accumulate forwarding, such as Cortex-A53/A57.
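  // Illustrative example (an assumption, not from the original source): for
  // v8i16 operands (zext v8i8 %a + zext v8i8 %b) and zext v8i8 %c, the
  // rewritten DAG is expected to select roughly
  //   umull v0.8h, va.8b, vc.8b
  //   umlal v0.8h, vb.8b, vc.8b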
5887 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5888 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5889 EVT Op1VT = Op1.getValueType();
5890 return DAG.getNode(
5891 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5892 DAG.getNode(N0.getOpcode(), DL, VT,
5893 DAG.getNode(NewOpc, DL, VT,
5894 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5895 DAG.getNode(NewOpc, DL, VT,
5896 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5897 DAG.getConstant(0, DL, MVT::i64));
5898}
5899
5900static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5901 int Pattern) {
5902 if (Pattern == AArch64SVEPredPattern::all)
5903 return DAG.getConstant(1, DL, VT);
5904 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5905 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5906}
5907
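// Worked example (illustrative, not from the original source): for
// llvm.aarch64.sve.whilelt(i64 3, i64 7) the number of active lanes is
// 7 - 3 = 4, so an nxv4i1 result can be folded to "ptrue p0.s, vl4" provided
// four elements fit within the minimum SVE vector length.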
5908static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
5909 bool IsSigned, bool IsEqual) {
5910 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5911 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5912
5913 if (!N->getValueType(0).isScalableVector() ||
5914 !isa<ConstantSDNode>(N->getOperand(Op1)))
5915 return SDValue();
5916
5917 SDLoc DL(N);
5918 APInt Y = N->getConstantOperandAPInt(Op1);
5919
5920 // When the second operand is the maximum value, comparisons that include
5921 // equality can never fail and thus we can return an all active predicate.
5922 if (IsEqual)
5923 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5924 return DAG.getConstant(1, DL, N->getValueType(0));
5925
5926 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
5927 return SDValue();
5928
5929 APInt X = N->getConstantOperandAPInt(Op0);
5930
5931 bool Overflow;
5932 APInt NumActiveElems =
5933 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5934
5935 if (Overflow)
5936 return SDValue();
5937
5938 if (IsEqual) {
5939 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5940 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5941 : NumActiveElems.uadd_ov(One, Overflow);
5942 if (Overflow)
5943 return SDValue();
5944 }
5945
5946 std::optional<unsigned> PredPattern =
5947 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5948 unsigned MinSVEVectorSize = std::max(
5949 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5950 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
5951 if (PredPattern != std::nullopt &&
5952 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5953 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
5954
5955 return SDValue();
5956}
5957
5958// Returns a safe bitcast between two scalable vector predicates, where
5959// any newly created lanes from a widening bitcast are defined as zero.
5960static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5961 SDLoc DL(Op);
5962 EVT InVT = Op.getValueType();
5963
5964 assert(InVT.getVectorElementType() == MVT::i1 &&
5965 VT.getVectorElementType() == MVT::i1 &&
5966 "Expected a predicate-to-predicate bitcast");
5967 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5968 InVT.isScalableVector() &&
5969 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5970 "Only expect to cast between legal scalable predicate types!");
5971
5972 // Return the operand if the cast isn't changing type,
5973 if (InVT == VT)
5974 return Op;
5975
5976 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5977 // than VT. This will increase the chances of removing casts that introduce
5978 // new lanes, which have to be explicitly zero'd.
5979 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5980 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5981 Op.getOperand(1).getValueType().bitsGT(VT))
5982 Op = Op.getOperand(1);
5983
5984 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5985
5986 // We only have to zero the lanes if new lanes are being defined, e.g. when
5987 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5988 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5989 // we can return here.
5990 if (InVT.bitsGT(VT))
5991 return Reinterpret;
5992
5993 // Check if the other lanes are already known to be zeroed by
5994 // construction.
5995 if (isZeroingInactiveLanes(Op))
5996 return Reinterpret;
5997
5998 // Zero the newly introduced lanes.
5999 SDValue Mask = DAG.getConstant(1, DL, InVT);
6000 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
6001 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
6002}
6003
6004SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
6005 SDValue Chain, SDLoc DL,
6006 EVT VT) const {
6007 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
6008 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
6009 getPointerTy(DAG.getDataLayout()));
6010 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
6011 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
6012 TargetLowering::CallLoweringInfo CLI(DAG);
6013 ArgListTy Args;
6014 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
6015 getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
6016 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
6017 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
6018 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
6019 Mask);
6020}
6021
6022// Lower an SME LDR/STR ZA intrinsic
6023// Case 1: If the vector number (vecnum) is an immediate in range, it gets
6024// folded into the instruction
6025// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
6026// Case 2: If the vecnum is not an immediate, then it is used to modify the base
6027// and tile slice registers
6028// ldr(%tileslice, %ptr, %vecnum)
6029// ->
6030// %svl = rdsvl
6031// %ptr2 = %ptr + %svl * %vecnum
6032// %tileslice2 = %tileslice + %vecnum
6033// ldr [%tileslice2, 0], [%ptr2, 0]
6034// Case 3: If the vecnum is an immediate out of range, then the same is done as
6035 // case 2, but the base and slice registers are modified by the greatest
6036 // multiple of 16 not exceeding the vecnum, and the remainder is folded into the
6037// instruction. This means that successive loads and stores that are offset from
6038// each other can share the same base and slice register updates.
6039// ldr(%tileslice, %ptr, 22)
6040// ldr(%tileslice, %ptr, 23)
6041// ->
6042// %svl = rdsvl
6043 // %ptr2 = %ptr + %svl * 16
6044 // %tileslice2 = %tileslice + 16
6045 // ldr [%tileslice2, 6], [%ptr2, 6]
6046 // ldr [%tileslice2, 7], [%ptr2, 7]
6047// Case 4: If the vecnum is an add of an immediate, then the non-immediate
6048// operand and the immediate can be folded into the instruction, like case 2.
6049// ldr(%tileslice, %ptr, %vecnum + 7)
6050// ldr(%tileslice, %ptr, %vecnum + 8)
6051// ->
6052// %svl = rdsvl
6053// %ptr2 = %ptr + %svl * %vecnum
6054// %tileslice2 = %tileslice + %vecnum
6055// ldr [%tileslice2, 7], [%ptr2, 7]
6056// ldr [%tileslice2, 8], [%ptr2, 8]
6057// Case 5: The vecnum being an add of an immediate out of range is also handled,
6058// in which case the same remainder logic as case 3 is used.
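// Worked example of case 5 (illustrative, not from the original source):
// ldr(%tileslice, %ptr, %vecnum + 22) has ConstAddend = 22, so ImmAddend = 6
// and the variable addend becomes %vecnum + 16; the base is advanced by
// %svl * (%vecnum + 16), the tile slice by %vecnum + 16, and the instruction
// keeps immediate offset 6.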
6059static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
6060 SDLoc DL(N);
6061
6062 SDValue TileSlice = N->getOperand(2);
6063 SDValue Base = N->getOperand(3);
6064 SDValue VecNum = N->getOperand(4);
6065 int32_t ConstAddend = 0;
6066 SDValue VarAddend = VecNum;
6067
6068 // If the vnum is an add of an immediate, we can fold it into the instruction
6069 if (VecNum.getOpcode() == ISD::ADD &&
6070 isa<ConstantSDNode>(VecNum.getOperand(1))) {
6071 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
6072 VarAddend = VecNum.getOperand(0);
6073 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
6074 ConstAddend = ImmNode->getSExtValue();
6075 VarAddend = SDValue();
6076 }
6077
6078 int32_t ImmAddend = ConstAddend % 16;
6079 if (int32_t C = (ConstAddend - ImmAddend)) {
6080 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
6081 VarAddend = VarAddend
6082 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
6083 : CVal;
6084 }
6085
6086 if (VarAddend) {
6087 // Get the vector length that will be multiplied by vnum
6088 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6089 DAG.getConstant(1, DL, MVT::i32));
6090
6091 // Multiply SVL and vnum then add it to the base
6092 SDValue Mul = DAG.getNode(
6093 ISD::MUL, DL, MVT::i64,
6094 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
6095 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
6096 // Just add vnum to the tileslice
6097 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
6098 }
6099
6100 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
6101 DL, MVT::Other,
6102 {/*Chain=*/N.getOperand(0), TileSlice, Base,
6103 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
6104}
6105
6106static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
6107 SDLoc DL(Op);
6108 SDValue ID =
6109 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
6110
6111 auto Op1 = Op.getOperand(1);
6112 auto Op2 = Op.getOperand(2);
6113 auto Mask = Op.getOperand(3);
6114
6115 EVT Op1VT = Op1.getValueType();
6116 EVT Op2VT = Op2.getValueType();
6117 EVT ResVT = Op.getValueType();
6118
6119 assert((Op1VT.getVectorElementType() == MVT::i8 ||
6120 Op1VT.getVectorElementType() == MVT::i16) &&
6121 "Expected 8-bit or 16-bit characters.");
6122
6123 // Scalable vector type used to wrap operands.
6124 // A single container is enough for both operands because ultimately the
6125 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
6126 EVT OpContainerVT = Op1VT.isScalableVector()
6127 ? Op1VT
6128 : getContainerForFixedLengthVector(DAG, Op1VT);
6129
6130 if (Op2VT.is128BitVector()) {
6131 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6132 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6133 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6134 if (ResVT.isScalableVector())
6135 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6136 DAG.getTargetConstant(0, DL, MVT::i64));
6137 } else {
6138 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
6139 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6140 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6141 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
6142 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6143 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6144 DAG.getConstant(0, DL, MVT::i64));
6145 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6146 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6147 }
6148
6149 // If the result is scalable, we just need to carry out the MATCH.
6150 if (ResVT.isScalableVector())
6151 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6152
6153 // If the result is fixed, we can still use MATCH but we need to wrap the
6154 // first operand and the mask in scalable vectors before doing so.
6155
6156 // Wrap the operands.
6157 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
6158 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6159 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6160
6161 // Carry out the match.
6162 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6163 ID, Mask, Op1, Op2);
6164
6165 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6166 // (v16i8/v8i8).
6167 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6168 Match = convertFromScalableVector(DAG, Op1VT, Match);
6169 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6170}
6171
6172SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6173 SelectionDAG &DAG) const {
6174 unsigned IntNo = Op.getConstantOperandVal(1);
6175 SDLoc DL(Op);
6176 switch (IntNo) {
6177 default:
6178 return SDValue(); // Don't custom lower most intrinsics.
6179 case Intrinsic::aarch64_prefetch: {
6180 SDValue Chain = Op.getOperand(0);
6181 SDValue Addr = Op.getOperand(2);
6182
6183 unsigned IsWrite = Op.getConstantOperandVal(3);
6184 unsigned Locality = Op.getConstantOperandVal(4);
6185 unsigned IsStream = Op.getConstantOperandVal(5);
6186 unsigned IsData = Op.getConstantOperandVal(6);
6187 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6188 (!IsData << 3) | // IsDataCache bit
6189 (Locality << 1) | // Cache level bits
6190 (unsigned)IsStream; // Stream bit
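    // Worked example (illustrative): IsWrite=0, IsData=1, Locality=0,
    // IsStream=0 gives PrfOp = 0b00000, i.e. PLDL1KEEP, and flipping IsWrite
    // to 1 gives 0b10000, i.e. PSTL1KEEP.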
6191
6192 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6193 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6194 }
6195 case Intrinsic::aarch64_sme_str:
6196 case Intrinsic::aarch64_sme_ldr: {
6197 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6198 }
6199 case Intrinsic::aarch64_sme_za_enable:
6200 return DAG.getNode(
6201 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6202 Op->getOperand(0), // Chain
6203 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6204 case Intrinsic::aarch64_sme_za_disable:
6205 return DAG.getNode(
6206 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6207 Op->getOperand(0), // Chain
6208 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6209 }
6210}
6211
6212SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6213 SelectionDAG &DAG) const {
6214 unsigned IntNo = Op.getConstantOperandVal(1);
6215 SDLoc DL(Op);
6216 switch (IntNo) {
6217 default:
6218 return SDValue(); // Don't custom lower most intrinsics.
6219 case Intrinsic::aarch64_mops_memset_tag: {
6220 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6221 SDValue Chain = Node->getChain();
6222 SDValue Dst = Op.getOperand(2);
6223 SDValue Val = Op.getOperand(3);
6224 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6225 SDValue Size = Op.getOperand(4);
6226 auto Alignment = Node->getMemOperand()->getAlign();
6227 bool IsVol = Node->isVolatile();
6228 auto DstPtrInfo = Node->getPointerInfo();
6229
6230 const auto &SDI =
6231 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6232 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6233 Chain, Dst, Val, Size, Alignment, IsVol,
6234 DstPtrInfo, MachinePointerInfo{});
6235
6236 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6237 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6238 // LowerOperationWrapper will complain that the number of results has
6239 // changed.
6240 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6241 }
6242 }
6243}
6244
6245SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6246 SelectionDAG &DAG) const {
6247 unsigned IntNo = Op.getConstantOperandVal(0);
6248 SDLoc DL(Op);
6249 switch (IntNo) {
6250 default: return SDValue(); // Don't custom lower most intrinsics.
6251 case Intrinsic::thread_pointer: {
6252 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6253 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6254 }
6255 case Intrinsic::aarch64_sve_whilewr_b:
6256 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6257 Op.getOperand(1), Op.getOperand(2),
6258 DAG.getConstant(1, DL, MVT::i64),
6259 DAG.getConstant(0, DL, MVT::i64));
6260 case Intrinsic::aarch64_sve_whilewr_h:
6261 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6262 Op.getOperand(1), Op.getOperand(2),
6263 DAG.getConstant(2, DL, MVT::i64),
6264 DAG.getConstant(0, DL, MVT::i64));
6265 case Intrinsic::aarch64_sve_whilewr_s:
6266 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6267 Op.getOperand(1), Op.getOperand(2),
6268 DAG.getConstant(4, DL, MVT::i64),
6269 DAG.getConstant(0, DL, MVT::i64));
6270 case Intrinsic::aarch64_sve_whilewr_d:
6271 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6272 Op.getOperand(1), Op.getOperand(2),
6273 DAG.getConstant(8, DL, MVT::i64),
6274 DAG.getConstant(0, DL, MVT::i64));
6275 case Intrinsic::aarch64_sve_whilerw_b:
6276 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6277 Op.getOperand(1), Op.getOperand(2),
6278 DAG.getConstant(1, DL, MVT::i64),
6279 DAG.getConstant(0, DL, MVT::i64));
6280 case Intrinsic::aarch64_sve_whilerw_h:
6281 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6282 Op.getOperand(1), Op.getOperand(2),
6283 DAG.getConstant(2, DL, MVT::i64),
6284 DAG.getConstant(0, DL, MVT::i64));
6285 case Intrinsic::aarch64_sve_whilerw_s:
6286 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6287 Op.getOperand(1), Op.getOperand(2),
6288 DAG.getConstant(4, DL, MVT::i64),
6289 DAG.getConstant(0, DL, MVT::i64));
6290 case Intrinsic::aarch64_sve_whilerw_d:
6291 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6292 Op.getOperand(1), Op.getOperand(2),
6293 DAG.getConstant(8, DL, MVT::i64),
6294 DAG.getConstant(0, DL, MVT::i64));
6295 case Intrinsic::aarch64_neon_abs: {
6296 EVT Ty = Op.getValueType();
6297 if (Ty == MVT::i64) {
6298 SDValue Result =
6299 DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
6300 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6301 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
6302 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6303 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6304 } else {
6305 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6306 }
6307 }
6308 case Intrinsic::aarch64_neon_pmull64: {
6309 SDValue LHS = Op.getOperand(1);
6310 SDValue RHS = Op.getOperand(2);
6311
6312 std::optional<uint64_t> LHSLane =
6313 getConstantLaneNumOfExtractHalfOperand(LHS);
6314 std::optional<uint64_t> RHSLane =
6315 getConstantLaneNumOfExtractHalfOperand(RHS);
6316
6317 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6318 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6319
6320 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
6321 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6322 // which ISel recognizes better. For example, generate a ldr into d*
6323 // registers as opposed to a GPR load followed by a fmov.
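    // Illustrative example (an assumption, not from the original source): when
    // both operands extract lane 1, e.g.
    //   pmull64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)),
    // the rewritten operands let ISel emit a single
    //   pmull2 v0.1q, va.2d, vb.2d
    // instead of moving the high lanes through general-purpose registers.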
6324 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6325 std::optional<uint64_t> OtherLane,
6326 const SDLoc &DL,
6327 SelectionDAG &DAG) -> SDValue {
6328 // If the operand is a higher half itself, rewrite it to
6329 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6330 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6331 if (NLane == 1)
6332 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6333 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6334
6335 // Operand N is not a higher half but the other operand is.
6336 if (OtherLane == 1) {
6337 // If this operand is a lower half, rewrite it to
6338 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6339 // align lanes of two operands. A roundtrip sequence (to move from lane
6340 // 1 to lane 0) is like this:
6341 // mov x8, v0.d[1]
6342 // fmov d0, x8
6343 if (NLane == 0)
6344 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6345 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6346 N.getOperand(0),
6347 DAG.getConstant(0, DL, MVT::i64)),
6348 DAG.getConstant(1, DL, MVT::i64));
6349
6350 // Otherwise just dup from main to all lanes.
6351 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6352 }
6353
6354 // Neither operand is an extract of higher half, so codegen may just use
6355 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
6356 assert(N.getValueType() == MVT::i64 &&
6357 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6358 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6359 };
6360
6361 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6362 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6363
6364 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6365 }
6366 case Intrinsic::aarch64_neon_smax:
6367 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6368 Op.getOperand(2));
6369 case Intrinsic::aarch64_neon_umax:
6370 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6371 Op.getOperand(2));
6372 case Intrinsic::aarch64_neon_smin:
6373 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6374 Op.getOperand(2));
6375 case Intrinsic::aarch64_neon_umin:
6376 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6377 Op.getOperand(2));
6378 case Intrinsic::aarch64_neon_scalar_sqxtn:
6379 case Intrinsic::aarch64_neon_scalar_sqxtun:
6380 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6381 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6382 if (Op.getValueType() == MVT::i32)
6383 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6384 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6385 Op.getOperand(0),
6386 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6387 Op.getOperand(1))));
6388 return SDValue();
6389 }
6390 case Intrinsic::aarch64_neon_sqxtn:
6391 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6392 Op.getOperand(1));
6393 case Intrinsic::aarch64_neon_sqxtun:
6394 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6395 Op.getOperand(1));
6396 case Intrinsic::aarch64_neon_uqxtn:
6397 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6398 Op.getOperand(1));
6399 case Intrinsic::aarch64_neon_sqshrn:
6400 if (Op.getValueType().isVector())
6401 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6402 DAG.getNode(AArch64ISD::VASHR, DL,
6403 Op.getOperand(1).getValueType(),
6404 Op.getOperand(1), Op.getOperand(2)));
6405 return SDValue();
6406 case Intrinsic::aarch64_neon_sqshrun:
6407 if (Op.getValueType().isVector())
6408 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6409 DAG.getNode(AArch64ISD::VASHR, DL,
6410 Op.getOperand(1).getValueType(),
6411 Op.getOperand(1), Op.getOperand(2)));
6412 return SDValue();
6413 case Intrinsic::aarch64_neon_uqshrn:
6414 if (Op.getValueType().isVector())
6415 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6416 DAG.getNode(AArch64ISD::VLSHR, DL,
6417 Op.getOperand(1).getValueType(),
6418 Op.getOperand(1), Op.getOperand(2)));
6419 return SDValue();
6420 case Intrinsic::aarch64_neon_sqrshrn:
6421 if (Op.getValueType().isVector())
6422 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6423 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6424 Op.getOperand(1).getValueType(),
6425 Op.getOperand(1), Op.getOperand(2)));
6426 return SDValue();
6427 case Intrinsic::aarch64_neon_sqrshrun:
6428 if (Op.getValueType().isVector())
6429 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6430 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6431 Op.getOperand(1).getValueType(),
6432 Op.getOperand(1), Op.getOperand(2)));
6433 return SDValue();
6434 case Intrinsic::aarch64_neon_uqrshrn:
6435 if (Op.getValueType().isVector())
6436 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6437 DAG.getNode(AArch64ISD::URSHR_I, DL,
6438 Op.getOperand(1).getValueType(),
6439 Op.getOperand(1), Op.getOperand(2)));
6440 return SDValue();
6441 case Intrinsic::aarch64_neon_sqrshl:
6442 if (Op.getValueType().isVector())
6443 return SDValue();
6444 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHL, DAG);
6445 case Intrinsic::aarch64_neon_sqshl:
6446 if (Op.getValueType().isVector())
6447 return SDValue();
6448 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHL, DAG);
6449 case Intrinsic::aarch64_neon_uqrshl:
6450 if (Op.getValueType().isVector())
6451 return SDValue();
6452 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHL, DAG);
6453 case Intrinsic::aarch64_neon_uqshl:
6454 if (Op.getValueType().isVector())
6455 return SDValue();
6456 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHL, DAG);
6457 case Intrinsic::aarch64_neon_sqadd:
6458 if (Op.getValueType().isVector())
6459 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6460 Op.getOperand(2));
6461 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQADD, DAG);
6462
6463 case Intrinsic::aarch64_neon_sqsub:
6464 if (Op.getValueType().isVector())
6465 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6466 Op.getOperand(2));
6467 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSUB, DAG);
6468
6469 case Intrinsic::aarch64_neon_uqadd:
6470 if (Op.getValueType().isVector())
6471 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6472 Op.getOperand(2));
6473 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQADD, DAG);
6474 case Intrinsic::aarch64_neon_uqsub:
6475 if (Op.getValueType().isVector())
6476 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6477 Op.getOperand(2));
6478 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSUB, DAG);
6479 case Intrinsic::aarch64_neon_sqdmulls_scalar:
6480 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULL, DAG);
6481 case Intrinsic::aarch64_sve_whilelt:
6482 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6483 /*IsEqual=*/false);
6484 case Intrinsic::aarch64_sve_whilels:
6485 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6486 /*IsEqual=*/true);
6487 case Intrinsic::aarch64_sve_whilele:
6488 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6489 /*IsEqual=*/true);
6490 case Intrinsic::aarch64_sve_sunpkhi:
6491 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6492 Op.getOperand(1));
6493 case Intrinsic::aarch64_sve_sunpklo:
6494 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6495 Op.getOperand(1));
6496 case Intrinsic::aarch64_sve_uunpkhi:
6497 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6498 Op.getOperand(1));
6499 case Intrinsic::aarch64_sve_uunpklo:
6500 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6501 Op.getOperand(1));
6502 case Intrinsic::aarch64_sve_clasta_n:
6503 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6504 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6505 case Intrinsic::aarch64_sve_clastb_n:
6506 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6507 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6508 case Intrinsic::aarch64_sve_lasta:
6509 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6510 Op.getOperand(1), Op.getOperand(2));
6511 case Intrinsic::aarch64_sve_lastb:
6512 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6513 Op.getOperand(1), Op.getOperand(2));
6514 case Intrinsic::aarch64_sve_tbl:
6515 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6516 Op.getOperand(2));
6517 case Intrinsic::aarch64_sve_trn1:
6518 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6519 Op.getOperand(1), Op.getOperand(2));
6520 case Intrinsic::aarch64_sve_trn2:
6521 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6522 Op.getOperand(1), Op.getOperand(2));
6523 case Intrinsic::aarch64_sve_uzp1:
6524 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6525 Op.getOperand(1), Op.getOperand(2));
6526 case Intrinsic::aarch64_sve_uzp2:
6527 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6528 Op.getOperand(1), Op.getOperand(2));
6529 case Intrinsic::aarch64_sve_zip1:
6530 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6531 Op.getOperand(1), Op.getOperand(2));
6532 case Intrinsic::aarch64_sve_zip2:
6533 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6534 Op.getOperand(1), Op.getOperand(2));
6535 case Intrinsic::aarch64_sve_splice:
6536 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6537 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6538 case Intrinsic::aarch64_sve_ptrue:
6539 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
6540 case Intrinsic::aarch64_sve_clz:
6541 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6542 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6543 case Intrinsic::aarch64_sme_cntsd: {
6544 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6545 DAG.getConstant(1, DL, MVT::i32));
6546 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6547 DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
6548 }
6549 case Intrinsic::aarch64_sve_cnt: {
6550 SDValue Data = Op.getOperand(3);
6551 // CTPOP only supports integer operands.
6552 if (Data.getValueType().isFloatingPoint())
6553 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6554 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6555 Op.getOperand(2), Data, Op.getOperand(1));
6556 }
6557 case Intrinsic::aarch64_sve_dupq_lane:
6558 return LowerDUPQLane(Op, DAG);
6559 case Intrinsic::aarch64_sve_convert_from_svbool:
6560 if (Op.getValueType() == MVT::aarch64svcount)
6561 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6562 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6563 case Intrinsic::aarch64_sve_convert_to_svbool:
6564 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6565 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6566 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6567 case Intrinsic::aarch64_sve_fneg:
6568 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6569 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6570 case Intrinsic::aarch64_sve_frintp:
6571 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6572 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6573 case Intrinsic::aarch64_sve_frintm:
6574 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6575 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6576 case Intrinsic::aarch64_sve_frinti:
6577 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6578 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6579 Op.getOperand(1));
6580 case Intrinsic::aarch64_sve_frintx:
6581 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6582 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6583 case Intrinsic::aarch64_sve_frinta:
6584 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6585 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6586 case Intrinsic::aarch64_sve_frintn:
6587 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6588 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6589 Op.getOperand(1));
6590 case Intrinsic::aarch64_sve_frintz:
6591 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6592 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6593 case Intrinsic::aarch64_sve_ucvtf:
6594 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6595 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6596 Op.getOperand(1));
6597 case Intrinsic::aarch64_sve_scvtf:
6598 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6599 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6600 Op.getOperand(1));
6601 case Intrinsic::aarch64_sve_fcvtzu:
6602 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6603 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6604 case Intrinsic::aarch64_sve_fcvtzs:
6605 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6606 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6607 case Intrinsic::aarch64_sve_fsqrt:
6608 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6609 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6610 case Intrinsic::aarch64_sve_frecpx:
6611 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6612 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6613 case Intrinsic::aarch64_sve_frecpe_x:
6614 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6615 Op.getOperand(1));
6616 case Intrinsic::aarch64_sve_frecps_x:
6617 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6618 Op.getOperand(1), Op.getOperand(2));
6619 case Intrinsic::aarch64_sve_frsqrte_x:
6620 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6621 Op.getOperand(1));
6622 case Intrinsic::aarch64_sve_frsqrts_x:
6623 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6624 Op.getOperand(1), Op.getOperand(2));
6625 case Intrinsic::aarch64_sve_fabs:
6626 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6627 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6628 case Intrinsic::aarch64_sve_abs:
6629 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6630 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6631 case Intrinsic::aarch64_sve_neg:
6632 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6633 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6634 case Intrinsic::aarch64_sve_insr: {
6635 SDValue Scalar = Op.getOperand(2);
6636 EVT ScalarTy = Scalar.getValueType();
6637 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6638 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6639
6640 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6641 Op.getOperand(1), Scalar);
6642 }
6643 case Intrinsic::aarch64_sve_rbit:
6644 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6645 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6646 Op.getOperand(1));
6647 case Intrinsic::aarch64_sve_revb:
6648 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6649 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6650 case Intrinsic::aarch64_sve_revh:
6651 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6652 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6653 case Intrinsic::aarch64_sve_revw:
6654 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6655 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6656 case Intrinsic::aarch64_sve_revd:
6657 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6658 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6659 case Intrinsic::aarch64_sve_sxtb:
6660 return DAG.getNode(
6661 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6662 Op.getOperand(2), Op.getOperand(3),
6663 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6664 Op.getOperand(1));
6665 case Intrinsic::aarch64_sve_sxth:
6666 return DAG.getNode(
6667 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6668 Op.getOperand(2), Op.getOperand(3),
6669 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6670 Op.getOperand(1));
6671 case Intrinsic::aarch64_sve_sxtw:
6672 return DAG.getNode(
6673 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6674 Op.getOperand(2), Op.getOperand(3),
6675 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6676 Op.getOperand(1));
6677 case Intrinsic::aarch64_sve_uxtb:
6678 return DAG.getNode(
6679 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6680 Op.getOperand(2), Op.getOperand(3),
6681 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6682 Op.getOperand(1));
6683 case Intrinsic::aarch64_sve_uxth:
6684 return DAG.getNode(
6685 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6686 Op.getOperand(2), Op.getOperand(3),
6687 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6688 Op.getOperand(1));
6689 case Intrinsic::aarch64_sve_uxtw:
6690 return DAG.getNode(
6691 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6692 Op.getOperand(2), Op.getOperand(3),
6693 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6694 Op.getOperand(1));
6695 case Intrinsic::localaddress: {
6696 const auto &MF = DAG.getMachineFunction();
6697 const auto *RegInfo = Subtarget->getRegisterInfo();
6698 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6699 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6700 Op.getSimpleValueType());
6701 }
6702
6703 case Intrinsic::eh_recoverfp: {
6704 // FIXME: This needs to be implemented to correctly handle highly aligned
6705 // stack objects. For now we simply return the incoming FP. Refer D53541
6706 // for more details.
6707 SDValue FnOp = Op.getOperand(1);
6708 SDValue IncomingFPOp = Op.getOperand(2);
6709 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6710 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6711 if (!Fn)
6712      report_fatal_error(
6713          "llvm.eh.recoverfp must take a function as the first argument");
6714 return IncomingFPOp;
6715 }
6716 case Intrinsic::aarch64_neon_vsri:
6717 case Intrinsic::aarch64_neon_vsli:
6718 case Intrinsic::aarch64_sve_sri:
6719 case Intrinsic::aarch64_sve_sli: {
6720 EVT Ty = Op.getValueType();
6721
6722 if (!Ty.isVector())
6723 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6724
6725 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6726
6727 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6728 IntNo == Intrinsic::aarch64_sve_sri;
6729 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6730 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6731 Op.getOperand(3));
6732 }
6733
6734 case Intrinsic::aarch64_neon_srhadd:
6735 case Intrinsic::aarch64_neon_urhadd:
6736 case Intrinsic::aarch64_neon_shadd:
6737 case Intrinsic::aarch64_neon_uhadd: {
6738 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6739 IntNo == Intrinsic::aarch64_neon_shadd);
6740 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6741 IntNo == Intrinsic::aarch64_neon_urhadd);
6742 unsigned Opcode = IsSignedAdd
6743 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6744 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6745 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6746 Op.getOperand(2));
6747 }
6748 case Intrinsic::aarch64_neon_saddlp:
6749 case Intrinsic::aarch64_neon_uaddlp: {
6750 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6751 ? AArch64ISD::UADDLP
6752 : AArch64ISD::SADDLP;
6753 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6754 }
6755 case Intrinsic::aarch64_neon_sdot:
6756 case Intrinsic::aarch64_neon_udot:
6757 case Intrinsic::aarch64_sve_sdot:
6758 case Intrinsic::aarch64_sve_udot: {
6759 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6760 IntNo == Intrinsic::aarch64_sve_udot)
6761 ? AArch64ISD::UDOT
6762 : AArch64ISD::SDOT;
6763 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6764 Op.getOperand(2), Op.getOperand(3));
6765 }
6766 case Intrinsic::aarch64_neon_usdot:
6767 case Intrinsic::aarch64_sve_usdot: {
6768 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6769 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6770 }
6771 case Intrinsic::aarch64_neon_saddlv:
6772 case Intrinsic::aarch64_neon_uaddlv: {
6773 EVT OpVT = Op.getOperand(1).getValueType();
6774 EVT ResVT = Op.getValueType();
6775 assert(
6776 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6777 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6778 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6779 "Unexpected aarch64_neon_u/saddlv type");
6780 (void)OpVT;
6781 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6782 SDValue ADDLV = DAG.getNode(
6783 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6784 : AArch64ISD::SADDLV,
6785 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6786 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6787 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6788 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6789 return EXTRACT_VEC_ELT;
6790 }
6791 case Intrinsic::experimental_cttz_elts: {
6792 SDValue CttzOp = Op.getOperand(1);
6793 EVT VT = CttzOp.getValueType();
6794 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6795
6796 if (VT.isFixedLengthVector()) {
6797 // We can use SVE instructions to lower this intrinsic by first creating
6798 // an SVE predicate register mask from the fixed-width vector.
6799 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6800 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
6801 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6802 }
6803
6804 SDValue NewCttzElts =
6805 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
6806 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
6807 }
6808 case Intrinsic::experimental_vector_match: {
6809 return LowerVectorMatch(Op, DAG);
6810 }
6811 }
6812}
6813
6814bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6815 if (VT.getVectorElementType() == MVT::i8 ||
6816 VT.getVectorElementType() == MVT::i16) {
6817 EltTy = MVT::i32;
6818 return true;
6819 }
6820 return false;
6821}
6822
6823bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6824 EVT DataVT) const {
6825 const EVT IndexVT = Extend.getOperand(0).getValueType();
6826 // SVE only supports implicit extension of 32-bit indices.
6827 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6828 return false;
6829
6830 // Indices cannot be smaller than the main data type.
6831 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6832 return false;
6833
6834 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6835 // element container type, which would violate the previous clause.
6836 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6837}
6838
6839/// Helper function to check if a small vector load can be optimized.
6840 static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD,
6841                                             const AArch64Subtarget &Subtarget) {
6842 if (!Subtarget.isNeonAvailable())
6843 return false;
6844 if (LD->isVolatile())
6845 return false;
6846
6847 EVT MemVT = LD->getMemoryVT();
6848 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16)
6849 return false;
6850
6851 Align Alignment = LD->getAlign();
6852 Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
6853 if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment)
6854 return false;
6855
6856 return true;
6857}
6858
6859bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6860 EVT ExtVT = ExtVal.getValueType();
6861 // Small, illegal vectors can be extended inreg.
6862 if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
6863 if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 &&
6864 isEligibleForSmallVectorLoadOpt(Load, *Subtarget))
6865 return true;
6866 }
6867 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6868 return false;
6869
6870 // It may be worth creating extending masked loads if there are multiple
6871 // masked loads using the same predicate. That way we'll end up creating
6872 // extending masked loads that may then get split by the legaliser. This
6873 // results in just one set of predicate unpacks at the start, instead of
6874 // multiple sets of vector unpacks after each load.
6875 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6876 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6877 // Disable extending masked loads for fixed-width for now, since the code
6878 // quality doesn't look great.
6879 if (!ExtVT.isScalableVector())
6880 return false;
6881
6882 unsigned NumExtMaskedLoads = 0;
6883 for (auto *U : Ld->getMask()->users())
6884 if (isa<MaskedLoadSDNode>(U))
6885 NumExtMaskedLoads++;
6886
6887 if (NumExtMaskedLoads <= 1)
6888 return false;
6889 }
6890 }
6891
6892 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
6893 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
6894 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
6895}
6896
6897unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6898 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6899 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6900 AArch64ISD::GLD1_MERGE_ZERO},
6901 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6902 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6903 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6904 AArch64ISD::GLD1_MERGE_ZERO},
6905 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6906 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6907 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6908 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6909 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6910 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6911 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6912 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6913 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6914 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6915 };
6916 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6917 return AddrModes.find(Key)->second;
6918}
6919
6920unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6921 switch (Opcode) {
6922 default:
6923 llvm_unreachable("unimplemented opcode");
6924 return Opcode;
6925 case AArch64ISD::GLD1_MERGE_ZERO:
6926 return AArch64ISD::GLD1S_MERGE_ZERO;
6927 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6928 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6929 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6930 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6931 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6932 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6933 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6934 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6935 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6936 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6937 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6938 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6939 }
6940}
6941
6942SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6943 SelectionDAG &DAG) const {
6944 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6945
6946 SDLoc DL(Op);
6947 SDValue Chain = MGT->getChain();
6948 SDValue PassThru = MGT->getPassThru();
6949 SDValue Mask = MGT->getMask();
6950 SDValue BasePtr = MGT->getBasePtr();
6951 SDValue Index = MGT->getIndex();
6952 SDValue Scale = MGT->getScale();
6953 EVT VT = Op.getValueType();
6954 EVT MemVT = MGT->getMemoryVT();
6955 ISD::LoadExtType ExtType = MGT->getExtensionType();
6956 ISD::MemIndexType IndexType = MGT->getIndexType();
6957
6958 // SVE supports zero (and so undef) passthrough values only, everything else
6959 // must be handled manually by an explicit select on the load's output.
6960 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6961 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6962 SDValue Load =
6963 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6964 MGT->getMemOperand(), IndexType, ExtType);
6965 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6966 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6967 }
6968
6969 bool IsScaled = MGT->isIndexScaled();
6970 bool IsSigned = MGT->isIndexSigned();
6971
6972 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
6973   // must be calculated beforehand.
6974 uint64_t ScaleVal = Scale->getAsZExtVal();
6975 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6976 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6977 EVT IndexVT = Index.getValueType();
6978 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6979 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6980 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6981
6982 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6983 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6984 MGT->getMemOperand(), IndexType, ExtType);
6985 }
6986
6987 // Lower fixed length gather to a scalable equivalent.
6988 if (VT.isFixedLengthVector()) {
6989 assert(Subtarget->useSVEForFixedLengthVectors() &&
6990 "Cannot lower when not using SVE for fixed vectors!");
6991
6992 // NOTE: Handle floating-point as if integer then bitcast the result.
6993 EVT DataVT = VT.changeVectorElementTypeToInteger();
6994 MemVT = MemVT.changeVectorElementTypeToInteger();
6995
6996 // Find the smallest integer fixed length vector we can use for the gather.
6997 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6998 if (DataVT.getVectorElementType() == MVT::i64 ||
6999 Index.getValueType().getVectorElementType() == MVT::i64 ||
7000 Mask.getValueType().getVectorElementType() == MVT::i64)
7001 PromotedVT = VT.changeVectorElementType(MVT::i64);
7002
7003 // Promote vector operands except for passthrough, which we know is either
7004 // undef or zero, and thus best constructed directly.
7005 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7006 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
7007 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
7008
7009 // A promoted result type forces the need for an extending load.
7010 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
7011 ExtType = ISD::EXTLOAD;
7012
7013 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
7014
7015 // Convert fixed length vector operands to scalable.
7016 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
7017 Index = convertToScalableVector(DAG, ContainerVT, Index);
7018     Mask = convertFixedMaskToScalableVector(Mask, DAG);
7019     PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
7020 : DAG.getConstant(0, DL, ContainerVT);
7021
7022 // Emit equivalent scalable vector gather.
7023 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
7024 SDValue Load =
7025 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
7026 Ops, MGT->getMemOperand(), IndexType, ExtType);
7027
7028 // Extract fixed length data then convert to the required result type.
7029 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
7030 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
7031 if (VT.isFloatingPoint())
7032 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
7033
7034 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7035 }
7036
7037 // Everything else is legal.
7038 return Op;
7039}
7040
7041SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
7042 SelectionDAG &DAG) const {
7043 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
7044
7045 SDLoc DL(Op);
7046 SDValue Chain = MSC->getChain();
7047 SDValue StoreVal = MSC->getValue();
7048 SDValue Mask = MSC->getMask();
7049 SDValue BasePtr = MSC->getBasePtr();
7050 SDValue Index = MSC->getIndex();
7051 SDValue Scale = MSC->getScale();
7052 EVT VT = StoreVal.getValueType();
7053 EVT MemVT = MSC->getMemoryVT();
7054 ISD::MemIndexType IndexType = MSC->getIndexType();
7055 bool Truncating = MSC->isTruncatingStore();
7056
7057 bool IsScaled = MSC->isIndexScaled();
7058 bool IsSigned = MSC->isIndexSigned();
7059
7060 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
7061   // must be calculated beforehand.
7062 uint64_t ScaleVal = Scale->getAsZExtVal();
7063 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
7064 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
7065 EVT IndexVT = Index.getValueType();
7066 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
7067 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
7068 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
7069
7070 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
7071 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
7072 MSC->getMemOperand(), IndexType, Truncating);
7073 }
7074
7075 // Lower fixed length scatter to a scalable equivalent.
7076 if (VT.isFixedLengthVector()) {
7077 assert(Subtarget->useSVEForFixedLengthVectors() &&
7078 "Cannot lower when not using SVE for fixed vectors!");
7079
7080 // Once bitcast we treat floating-point scatters as if integer.
7081 if (VT.isFloatingPoint()) {
7082       VT = VT.changeVectorElementTypeToInteger();
7083       MemVT = MemVT.changeVectorElementTypeToInteger();
7084 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
7085 }
7086
7087 // Find the smallest integer fixed length vector we can use for the scatter.
7088 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
7089 if (VT.getVectorElementType() == MVT::i64 ||
7090 Index.getValueType().getVectorElementType() == MVT::i64 ||
7091 Mask.getValueType().getVectorElementType() == MVT::i64)
7092 PromotedVT = VT.changeVectorElementType(MVT::i64);
7093
7094 // Promote vector operands.
7095 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7096 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
7097 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
7098 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
7099
7100 // A promoted value type forces the need for a truncating store.
7101 if (PromotedVT != VT)
7102 Truncating = true;
7103
7104 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
7105
7106 // Convert fixed length vector operands to scalable.
7107 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
7108 Index = convertToScalableVector(DAG, ContainerVT, Index);
7109     Mask = convertFixedMaskToScalableVector(Mask, DAG);
7110     StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
7111
7112 // Emit equivalent scalable vector scatter.
7113 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
7114 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
7115 MSC->getMemOperand(), IndexType, Truncating);
7116 }
7117
7118 // Everything else is legal.
7119 return Op;
7120}
7121
7122SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
7123 SDLoc DL(Op);
7124 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
7125 assert(LoadNode && "Expected custom lowering of a masked load node");
7126 EVT VT = Op->getValueType(0);
7127
7128 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7129 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
7130
7131 SDValue PassThru = LoadNode->getPassThru();
7132 SDValue Mask = LoadNode->getMask();
7133
7134 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
7135 return Op;
7136
7137   SDValue Load = DAG.getMaskedLoad(
7138       VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7139 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
7140 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
7141 LoadNode->getExtensionType());
7142
7143 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7144
7145 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7146}
7147
7148// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
7149 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
7150                                         EVT VT, EVT MemVT,
7151 SelectionDAG &DAG) {
7152 assert(VT.isVector() && "VT should be a vector type");
7153 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
7154
7155 SDValue Value = ST->getValue();
7156
7157   // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
7158   // extracts the word lane which represents the v4i8 subvector. It optimizes
7159   // the store to:
7160 //
7161 // xtn v0.8b, v0.8h
7162 // str s0, [x0]
7163
7164 SDValue Undef = DAG.getUNDEF(MVT::i16);
7165 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
7166 {Undef, Undef, Undef, Undef});
7167
7168 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
7169 Value, UndefVec);
7170 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7171
7172 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7173 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7174 Trunc, DAG.getConstant(0, DL, MVT::i64));
7175
7176 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7177 ST->getBasePtr(), ST->getMemOperand());
7178}
7179
7180 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
7181   SDLoc DL(Op);
7182 SDValue Src = Op.getOperand(0);
7183 MVT DestVT = Op.getSimpleValueType();
7184   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7185   AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op);
7186
7187 unsigned SrcAS = N->getSrcAddressSpace();
7188 unsigned DestAS = N->getDestAddressSpace();
7189 assert(SrcAS != DestAS &&
7190 "addrspacecast must be between different address spaces");
7191 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7192 TLI.getTargetMachine().getPointerSize(DestAS) &&
7193 "addrspacecast must be between different ptr sizes");
7194 (void)TLI;
7195
7196 if (SrcAS == ARM64AS::PTR32_SPTR) {
7197 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7198 DAG.getTargetConstant(0, DL, DestVT));
7199 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7200 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7201 DAG.getTargetConstant(0, DL, DestVT));
7202 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7203 (DestAS == ARM64AS::PTR32_UPTR)) {
7204 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7205 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7206 return Trunc;
7207 } else {
7208 return Src;
7209 }
7210}
7211
7212// Custom lowering for any store, vector or scalar and/or default or with
7213// a truncate operations. Currently only custom lower truncate operation
7214// from vector v4i16 to v4i8 or volatile stores of i128.
7215SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7216 SelectionDAG &DAG) const {
7217 SDLoc Dl(Op);
7218 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7219 assert (StoreNode && "Can only custom lower store nodes");
7220
7221 SDValue Value = StoreNode->getValue();
7222
7223 EVT VT = Value.getValueType();
7224 EVT MemVT = StoreNode->getMemoryVT();
7225
7226 if (VT.isVector()) {
7227     if (useSVEForFixedLengthVectorVT(
7228             VT,
7229 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7230 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7231
7232 unsigned AS = StoreNode->getAddressSpace();
7233 Align Alignment = StoreNode->getAlign();
7234 if (Alignment < MemVT.getStoreSize() &&
7235 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7236 StoreNode->getMemOperand()->getFlags(),
7237 nullptr)) {
7238 return scalarizeVectorStore(StoreNode, DAG);
7239 }
7240
7241 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7242 MemVT == MVT::v4i8) {
7243 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7244 }
7245 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
7246 // the custom lowering, as there are no un-paired non-temporal stores and
7247 // legalization will break up 256 bit inputs.
7248 ElementCount EC = MemVT.getVectorElementCount();
7249 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
7250 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
7251 (MemVT.getScalarSizeInBits() == 8u ||
7252 MemVT.getScalarSizeInBits() == 16u ||
7253 MemVT.getScalarSizeInBits() == 32u ||
7254 MemVT.getScalarSizeInBits() == 64u)) {
7255 SDValue Lo =
7256           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7257                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7258                       StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
7259 SDValue Hi =
7260           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7261                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7262                       StoreNode->getValue(),
7263 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
7264       SDValue Result = DAG.getMemIntrinsicNode(
7265           AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
7266 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7267 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7268 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7269 return Result;
7270 }
7271 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7272 return LowerStore128(Op, DAG);
7273 } else if (MemVT == MVT::i64x8) {
7274 SDValue Value = StoreNode->getValue();
7275 assert(Value->getValueType(0) == MVT::i64x8);
7276 SDValue Chain = StoreNode->getChain();
7277 SDValue Base = StoreNode->getBasePtr();
7278 EVT PtrVT = Base.getValueType();
7279 for (unsigned i = 0; i < 8; i++) {
7280 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
7281 Value, DAG.getConstant(i, Dl, MVT::i32));
7282 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7283 DAG.getConstant(i * 8, Dl, PtrVT));
7284 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7285 StoreNode->getBaseAlign());
7286 }
7287 return Chain;
7288 }
7289
7290 return SDValue();
7291}
7292
7293/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7294SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7295 SelectionDAG &DAG) const {
7296 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7297 assert(StoreNode->getMemoryVT() == MVT::i128);
7298 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7299
7300 bool IsStoreRelease =
7301       StoreNode->getMergedOrdering() == AtomicOrdering::Release;
7302   if (StoreNode->isAtomic())
7303 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7304             Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7305            StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
7306            StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
7307
7308 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7309 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7310 ? StoreNode->getOperand(1)
7311 : StoreNode->getOperand(2);
7312 SDLoc DL(Op);
7313 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7314 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7315 if (DAG.getDataLayout().isBigEndian())
7316 std::swap(StoreValue.first, StoreValue.second);
7317   SDValue Result = DAG.getMemIntrinsicNode(
7318       Opcode, DL, DAG.getVTList(MVT::Other),
7319 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7320 StoreNode->getBasePtr()},
7321 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7322 return Result;
7323}
7324
7325/// Helper function to optimize loads of extended small vectors.
7326/// These patterns would otherwise get scalarized into inefficient sequences.
7327 static SDValue tryLowerSmallVectorExtLoad(LoadSDNode *Load, SelectionDAG &DAG) {
7328   const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
7329 if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget))
7330 return SDValue();
7331
7332 EVT MemVT = Load->getMemoryVT();
7333 EVT ResVT = Load->getValueType(0);
7334 unsigned NumElts = ResVT.getVectorNumElements();
7335 unsigned DstEltBits = ResVT.getScalarSizeInBits();
7336 unsigned SrcEltBits = MemVT.getScalarSizeInBits();
7337
7338 unsigned ExtOpcode;
7339 switch (Load->getExtensionType()) {
7340 case ISD::EXTLOAD:
7341 case ISD::ZEXTLOAD:
7342 ExtOpcode = ISD::ZERO_EXTEND;
7343 break;
7344 case ISD::SEXTLOAD:
7345 ExtOpcode = ISD::SIGN_EXTEND;
7346 break;
7347 case ISD::NON_EXTLOAD:
7348 return SDValue();
7349 }
7350
7351 SDLoc DL(Load);
7352 SDValue Chain = Load->getChain();
7353 SDValue BasePtr = Load->getBasePtr();
7354 const MachinePointerInfo &PtrInfo = Load->getPointerInfo();
7355 Align Alignment = Load->getAlign();
7356
7357 // Load the data as an FP scalar to avoid issues with integer loads.
7358 unsigned LoadBits = MemVT.getStoreSizeInBits();
7359 MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits);
7360 SDValue ScalarLoad =
7361 DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment);
7362
7363 MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits);
7364 SDValue ScalarToVec =
7365 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad);
7366 MVT BitcastTy =
7367 MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits);
7368 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec);
7369
7370 SDValue Res = Bitcast;
7371 unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits();
7372 unsigned CurrentNumElts = Res.getValueType().getVectorNumElements();
7373 while (CurrentEltBits < DstEltBits) {
7374 if (Res.getValueSizeInBits() >= 128) {
7375 CurrentNumElts = CurrentNumElts / 2;
7376 MVT ExtractVT =
7377 MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7378 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res,
7379 DAG.getConstant(0, DL, MVT::i64));
7380 }
7381 CurrentEltBits = CurrentEltBits * 2;
7382 MVT ExtVT =
7383 MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7384 Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res);
7385 }
7386
7387 if (CurrentNumElts != NumElts) {
7388 MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts);
7389 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res,
7390 DAG.getConstant(0, DL, MVT::i64));
7391 }
7392
7393 return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL);
7394}
7395
7396SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7397 SelectionDAG &DAG) const {
7398 SDLoc DL(Op);
7399 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7400 assert(LoadNode && "Expected custom lowering of a load node");
7401
7402 if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG))
7403 return Result;
7404
7405 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7406     SmallVector<SDValue, 8> Ops;
7407     SDValue Base = LoadNode->getBasePtr();
7408 SDValue Chain = LoadNode->getChain();
7409 EVT PtrVT = Base.getValueType();
7410 for (unsigned i = 0; i < 8; i++) {
7411 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7412 DAG.getConstant(i * 8, DL, PtrVT));
7413 SDValue Part =
7414 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7415 LoadNode->getBaseAlign());
7416 Ops.push_back(Part);
7417 Chain = SDValue(Part.getNode(), 1);
7418 }
7419 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7420 return DAG.getMergeValues({Loaded, Chain}, DL);
7421 }
7422
7423 return SDValue();
7424}
7425
7426// Convert to ContainerVT with no-op casts where possible.
7427 static SDValue convertToSVEContainerType(SDLoc DL, SDValue Vec, EVT ContainerVT,
7428                                          SelectionDAG &DAG) {
7429 EVT VecVT = Vec.getValueType();
7430 if (VecVT.isFloatingPoint()) {
7431 // Use no-op casts for floating-point types.
7432 EVT PackedVT = getPackedSVEVectorVT(VecVT.getScalarType());
7433 Vec = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedVT, Vec);
7434 Vec = DAG.getNode(AArch64ISD::NVCAST, DL, ContainerVT, Vec);
7435 } else {
7436 // Extend integers (may not be a no-op).
7437 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7438 }
7439 return Vec;
7440}
7441
7442// Convert to VecVT with no-op casts where possible.
7443 static SDValue convertFromSVEContainerType(SDLoc DL, SDValue Vec, EVT VecVT,
7444                                            SelectionDAG &DAG) {
7445 if (VecVT.isFloatingPoint()) {
7446 // Use no-op casts for floating-point types.
7447 EVT PackedVT = getPackedSVEVectorVT(VecVT.getScalarType());
7448 Vec = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVT, Vec);
7449 Vec = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VecVT, Vec);
7450 } else {
7451 // Truncate integers (may not be a no-op).
7452 Vec = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Vec);
7453 }
7454 return Vec;
7455}
7456
7457SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7458 SelectionDAG &DAG) const {
7459 SDLoc DL(Op);
7460 SDValue Vec = Op.getOperand(0);
7461 SDValue Mask = Op.getOperand(1);
7462 SDValue Passthru = Op.getOperand(2);
7463 EVT VecVT = Vec.getValueType();
7464 EVT MaskVT = Mask.getValueType();
7465 EVT ElmtVT = VecVT.getVectorElementType();
7466 const bool IsFixedLength = VecVT.isFixedLengthVector();
7467 const bool HasPassthru = !Passthru.isUndef();
7468 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7469 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7470
7471 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7472
7473 if (!Subtarget->isSVEAvailable())
7474 return SDValue();
7475
7476 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7477 return SDValue();
7478
7479 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7480 if (MinElmts != 2 && MinElmts != 4)
7481 return SDValue();
7482
7483 // We can use the SVE register containing the NEON vector in its lowest bits.
7484 if (IsFixedLength) {
7485 EVT ScalableVecVT =
7486 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7487 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7488 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7489
7490 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7491 DAG.getUNDEF(ScalableVecVT), Vec,
7492 DAG.getConstant(0, DL, MVT::i64));
7493 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7494 DAG.getUNDEF(ScalableMaskVT), Mask,
7495 DAG.getConstant(0, DL, MVT::i64));
7496     Mask = DAG.getNode(ISD::TRUNCATE, DL,
7497                        ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7498 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7499 DAG.getUNDEF(ScalableVecVT), Passthru,
7500 DAG.getConstant(0, DL, MVT::i64));
7501
7502 VecVT = Vec.getValueType();
7503 MaskVT = Mask.getValueType();
7504 }
7505
7506 // Get legal type for compact instruction
7507 EVT ContainerVT = getSVEContainerType(VecVT);
7508
7509 // Convert to 32 or 64 bits for smaller types, as these are the only supported
7510 // sizes for compact.
7511 Vec = convertToSVEContainerType(DL, Vec, ContainerVT, DAG);
7512
7513 SDValue Compressed = DAG.getNode(
7514       ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
7515       DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask,
7516 Vec);
7517
7518 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7519 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7520 SDValue Offset = DAG.getNode(
7521 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7522 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask,
7523 Mask);
7524
7525 SDValue IndexMask = DAG.getNode(
7526 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7527 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7528 DAG.getConstant(0, DL, MVT::i64), Offset);
7529
7530 Compressed =
7531 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7532 }
7533
7534 // If we changed the element type before, we need to convert it back.
7535 if (ElmtVT.isFloatingPoint())
7536 Compressed = convertFromSVEContainerType(DL, Compressed, VecVT, DAG);
7537
7538 // Extracting from a legal SVE type before truncating produces better code.
7539 if (IsFixedLength) {
7540 EVT FixedSubVector = VecVT.isInteger()
7541 ? FixedVecVT.changeVectorElementType(
7542 ContainerVT.getVectorElementType())
7543 : FixedVecVT;
7544 Compressed = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedSubVector,
7545 Compressed, DAG.getConstant(0, DL, MVT::i64));
7546 VecVT = FixedVecVT;
7547 }
7548
7549 if (VecVT.isInteger())
7550 Compressed = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
7551
7552 return Compressed;
7553}
7554
7555// Generate SUBS and CSEL for integer abs.
7556SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7557 MVT VT = Op.getSimpleValueType();
7558
7559 if (VT.isVector())
7560 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7561
7562 SDLoc DL(Op);
7563 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
7564
7565 // Generate SUBS & CSEL.
7566 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7567 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7568 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7569 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7570}
7571
7572 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7573   SDValue Chain = Op.getOperand(0);
7574 SDValue Cond = Op.getOperand(1);
7575 SDValue Dest = Op.getOperand(2);
7576
7577   AArch64CC::CondCode CC;
7578   if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7579 SDLoc DL(Op);
7580 SDValue CCVal = getCondCode(DAG, CC);
7581 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7582 Cmp);
7583 }
7584
7585 return SDValue();
7586}
7587
7588 // Treat FSHR with constant shifts as a legal operation; otherwise it is
7589 // expanded. FSHL is converted to FSHR before deciding what to do with it.
7590 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7591   SDValue Shifts = Op.getOperand(2);
7592 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7593 // If opcode is FSHL, convert it to FSHR
7594 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7595 SDLoc DL(Op);
7596 MVT VT = Op.getSimpleValueType();
7597 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7598
7599 if (Op.getOpcode() == ISD::FSHL) {
7600 if (NewShiftNo == 0)
7601 return Op.getOperand(0);
7602
7603 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7604 return DAG.getNode(
7605 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7606 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7607 }
7608
7609 if (Op.getOpcode() == ISD::FSHR) {
7610 if (NewShiftNo == 0)
7611 return Op.getOperand(1);
7612
7613 if (ShiftNo->getZExtValue() == NewShiftNo)
7614 return Op;
7615
7616 // Rewrite using the normalised shift amount.
7617 return DAG.getNode(
7618 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7619 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7620 }
7621 }
7622
7623 return SDValue();
7624}
7625
7626 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7627   SDValue X = Op.getOperand(0);
7628 EVT XScalarTy = X.getValueType();
7629 SDValue Exp = Op.getOperand(1);
7630
7631 SDLoc DL(Op);
7632 EVT XVT, ExpVT;
7633 switch (Op.getSimpleValueType().SimpleTy) {
7634 default:
7635 return SDValue();
7636 case MVT::bf16:
7637 case MVT::f16:
7638 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7639 [[fallthrough]];
7640 case MVT::f32:
7641 XVT = MVT::nxv4f32;
7642 ExpVT = MVT::nxv4i32;
7643 break;
7644 case MVT::f64:
7645 XVT = MVT::nxv2f64;
7646 ExpVT = MVT::nxv2i64;
7647 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7648 break;
7649 }
7650
7651 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7652 SDValue VX =
7653 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7654 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7655 DAG.getUNDEF(ExpVT), Exp, Zero);
7656 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7657 AArch64SVEPredPattern::all);
7658 SDValue FScale = DAG.getNode(
7659       ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7660       DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg,
7661 VX, VExp);
7662 SDValue Final =
7663 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7664 if (X.getValueType() != XScalarTy)
7665 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7666 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7667 return Final;
7668}
7669
7670SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7671 SelectionDAG &DAG) const {
7672 return Op.getOperand(0);
7673}
7674
7675SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7676 SelectionDAG &DAG) const {
7677 SDValue Chain = Op.getOperand(0);
7678 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7679 SDValue FPtr = Op.getOperand(2); // nested function
7680 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7681
7682 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7683
7684 // ldr NestReg, .+16
7685 // ldr x17, .+20
7686 // br x17
7687 // .word 0
7688 // .nest: .qword nest
7689 // .fptr: .qword fptr
7690 SDValue OutChains[5];
7691
7692 const Function *Func =
7693 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7694 CallingConv::ID CC = Func->getCallingConv();
7695 unsigned NestReg;
7696
7697 switch (CC) {
7698 default:
7699 NestReg = 0x0f; // X15
7700 break;
7702 // Must be kept in sync with AArch64CallingConv.td
7703 NestReg = 0x04; // X4
7704 break;
7705 }
7706
7707 const char FptrReg = 0x11; // X17
7708
7709 SDValue Addr = Trmp;
7710
7711 SDLoc DL(Op);
7712 OutChains[0] = DAG.getStore(
7713 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7714 MachinePointerInfo(TrmpAddr));
7715
7716 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7717 DAG.getConstant(4, DL, MVT::i64));
7718 OutChains[1] = DAG.getStore(
7719 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7720 MachinePointerInfo(TrmpAddr, 4));
7721
7722 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7723 DAG.getConstant(8, DL, MVT::i64));
7724 OutChains[2] =
7725 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7726 MachinePointerInfo(TrmpAddr, 8));
7727
7728 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7729 DAG.getConstant(16, DL, MVT::i64));
7730 OutChains[3] =
7731 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7732
7733 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7734 DAG.getConstant(24, DL, MVT::i64));
7735 OutChains[4] =
7736 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7737
7738 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7739
7740 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7741 DAG.getConstant(12, DL, MVT::i64));
7742
7743 // Call clear cache on the trampoline instructions.
7744 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7745 EndOfTrmp);
7746}
7747
7748SDValue AArch64TargetLowering::LowerFMUL(SDValue Op, SelectionDAG &DAG) const {
7749 SDLoc DL(Op);
7750 EVT VT = Op.getValueType();
7751 if (VT.getScalarType() != MVT::bf16 ||
7752 (Subtarget->hasSVEB16B16() &&
7753 Subtarget->isNonStreamingSVEorSME2Available()))
7754 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7755
7756 assert(Subtarget->hasBF16() && "Expected +bf16 for custom FMUL lowering");
7757 assert((VT == MVT::nxv4bf16 || VT == MVT::nxv8bf16 || VT == MVT::v8bf16) &&
7758 "Unexpected FMUL VT");
7759
7760 auto MakeGetIntrinsic = [&](Intrinsic::ID IID) {
7761 return [&, IID](EVT VT, auto... Ops) {
7762 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
7763 DAG.getConstant(IID, DL, MVT::i32), Ops...);
7764 };
7765 };
7766
7767 auto Reinterpret = [&](SDValue Value, EVT VT) {
7768 EVT SrcVT = Value.getValueType();
7769 if (VT == SrcVT)
7770 return Value;
7771 if (SrcVT.isFixedLengthVector())
7772 return convertToScalableVector(DAG, VT, Value);
7773 if (VT.isFixedLengthVector())
7774 return convertFromScalableVector(DAG, VT, Value);
7775 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Value);
7776 };
7777
7778 bool UseSVEBFMLAL = VT.isScalableVector();
7779 auto FCVT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvt_bf16f32_v2);
7780 auto FCVTNT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2);
7781
7782 // Note: The NEON BFMLAL[BT] reads even/odd lanes like the SVE variant.
7783 // This does not match BFCVTN[2], so we use SVE to convert back to bf16.
7784 auto BFMLALB =
7785 MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalb
7786 : Intrinsic::aarch64_neon_bfmlalb);
7787 auto BFMLALT =
7788 MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalt
7789 : Intrinsic::aarch64_neon_bfmlalt);
7790
7791 EVT AccVT = UseSVEBFMLAL ? MVT::nxv4f32 : MVT::v4f32;
7792 SDValue Zero = DAG.getNeutralElement(ISD::FADD, DL, AccVT, Op->getFlags());
7793 SDValue Pg = getPredicateForVector(DAG, DL, AccVT);
7794
7795 // Lower bf16 FMUL as a pair (VT == [nx]v8bf16) of BFMLAL top/bottom
7796 // instructions. These result in two f32 vectors, which can be converted back
7797 // to bf16 with FCVT and FCVTNT.
7798 SDValue LHS = Op.getOperand(0);
7799 SDValue RHS = Op.getOperand(1);
7800
7801 // All SVE intrinsics expect to operate on full bf16 vector types.
7802 if (UseSVEBFMLAL) {
7803 LHS = Reinterpret(LHS, MVT::nxv8bf16);
7804 RHS = Reinterpret(RHS, MVT::nxv8bf16);
7805 }
7806
7807 SDValue BottomF32 = Reinterpret(BFMLALB(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
7808 SDValue BottomBF16 =
7809 FCVT(MVT::nxv8bf16, DAG.getPOISON(MVT::nxv8bf16), Pg, BottomF32);
7810 // Note: nxv4bf16 only uses even lanes.
7811 if (VT == MVT::nxv4bf16)
7812 return Reinterpret(BottomBF16, VT);
7813
7814 SDValue TopF32 = Reinterpret(BFMLALT(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
7815 SDValue TopBF16 = FCVTNT(MVT::nxv8bf16, BottomBF16, Pg, TopF32);
7816 return Reinterpret(TopBF16, VT);
7817}
7818
7819SDValue AArch64TargetLowering::LowerFMA(SDValue Op, SelectionDAG &DAG) const {
7820 SDValue OpA = Op->getOperand(0);
7821 SDValue OpB = Op->getOperand(1);
7822 SDValue OpC = Op->getOperand(2);
7823 EVT VT = Op.getValueType();
7824 SDLoc DL(Op);
7825
7826 assert(VT.isVector() && "Scalar fma lowering should be handled by patterns");
7827
7828 // Bail early if we're definitely not looking to merge FNEGs into the FMA.
7829 if (VT != MVT::v8f16 && VT != MVT::v4f32 && VT != MVT::v2f64)
7830 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7831
7832 if (OpC.getOpcode() != ISD::FNEG)
7833 return useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())
7834 ? LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED)
7835 : Op; // Fallback to NEON lowering.
7836
7837 // Convert FMA/FNEG nodes to SVE to enable the following patterns:
7838 // fma(a, b, neg(c)) -> fnmls(a, b, c)
7839 // fma(neg(a), b, neg(c)) -> fnmla(a, b, c)
7840 // fma(a, neg(b), neg(c)) -> fnmla(a, b, c)
7841 SDValue Pg = getPredicateForVector(DAG, DL, VT);
7842 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
7843
7844 auto ConvertToScalableFnegMt = [&](SDValue Op) {
7845 if (Op.getOpcode() == ISD::FNEG)
7846 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7847 return convertToScalableVector(DAG, ContainerVT, Op);
7848 };
7849
7850 OpA = ConvertToScalableFnegMt(OpA);
7851 OpB = ConvertToScalableFnegMt(OpB);
7852 OpC = ConvertToScalableFnegMt(OpC);
7853
7854 SDValue ScalableRes =
7855 DAG.getNode(AArch64ISD::FMA_PRED, DL, ContainerVT, Pg, OpA, OpB, OpC);
7856 return convertFromScalableVector(DAG, VT, ScalableRes);
7857}
7858
7859 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7860                                               SelectionDAG &DAG) const {
7861 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7862 LLVM_DEBUG(Op.dump());
7863
7864 switch (Op.getOpcode()) {
7865 default:
7866 llvm_unreachable("unimplemented operand");
7867 return SDValue();
7868   case ISD::LOOP_DEPENDENCE_WAR_MASK:
7869   case ISD::LOOP_DEPENDENCE_RAW_MASK:
7870     return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
7871 case ISD::BITCAST:
7872 return LowerBITCAST(Op, DAG);
7873 case ISD::GlobalAddress:
7874 return LowerGlobalAddress(Op, DAG);
7875   case ISD::GlobalTLSAddress:
7876     return LowerGlobalTLSAddress(Op, DAG);
7877   case ISD::PtrAuthGlobalAddress:
7878     return LowerPtrAuthGlobalAddress(Op, DAG);
7879 case ISD::ADJUST_TRAMPOLINE:
7880 return LowerADJUST_TRAMPOLINE(Op, DAG);
7881 case ISD::INIT_TRAMPOLINE:
7882 return LowerINIT_TRAMPOLINE(Op, DAG);
7883 case ISD::SETCC:
7884 case ISD::STRICT_FSETCC:
7885   case ISD::STRICT_FSETCCS:
7886     return LowerSETCC(Op, DAG);
7887 case ISD::SETCCCARRY:
7888 return LowerSETCCCARRY(Op, DAG);
7889 case ISD::BRCOND:
7890 return LowerBRCOND(Op, DAG);
7891 case ISD::BR_CC:
7892 return LowerBR_CC(Op, DAG);
7893 case ISD::SELECT:
7894 return LowerSELECT(Op, DAG);
7895 case ISD::SELECT_CC:
7896 return LowerSELECT_CC(Op, DAG);
7897 case ISD::JumpTable:
7898 return LowerJumpTable(Op, DAG);
7899 case ISD::BR_JT:
7900 return LowerBR_JT(Op, DAG);
7901 case ISD::BRIND:
7902 return LowerBRIND(Op, DAG);
7903 case ISD::ConstantPool:
7904 return LowerConstantPool(Op, DAG);
7905 case ISD::BlockAddress:
7906 return LowerBlockAddress(Op, DAG);
7907 case ISD::VASTART:
7908 return LowerVASTART(Op, DAG);
7909 case ISD::VACOPY:
7910 return LowerVACOPY(Op, DAG);
7911 case ISD::VAARG:
7912 return LowerVAARG(Op, DAG);
7913 case ISD::UADDO_CARRY:
7914 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7915 case ISD::USUBO_CARRY:
7916 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7917 case ISD::SADDO_CARRY:
7918 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7919 case ISD::SSUBO_CARRY:
7920 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7921 case ISD::SADDO:
7922 case ISD::UADDO:
7923 case ISD::SSUBO:
7924 case ISD::USUBO:
7925 case ISD::SMULO:
7926 case ISD::UMULO:
7927 return LowerXALUO(Op, DAG);
7928 case ISD::FADD:
7929 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7930 case ISD::FSUB:
7931 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7932 case ISD::FMUL:
7933 return LowerFMUL(Op, DAG);
7934 case ISD::FMA:
7935 return LowerFMA(Op, DAG);
7936 case ISD::FDIV:
7937 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7938 case ISD::FNEG:
7939 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7940 case ISD::FCEIL:
7941 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7942 case ISD::FFLOOR:
7943 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7944 case ISD::FNEARBYINT:
7945 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7946 case ISD::FRINT:
7947 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7948 case ISD::FROUND:
7949 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7950 case ISD::FROUNDEVEN:
7951 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7952 case ISD::FTRUNC:
7953 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7954 case ISD::FSQRT:
7955 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7956 case ISD::FABS:
7957 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7958 case ISD::FP_ROUND:
7959   case ISD::STRICT_FP_ROUND:
7960     return LowerFP_ROUND(Op, DAG);
7961 case ISD::FP_EXTEND:
7962   case ISD::STRICT_FP_EXTEND:
7963     return LowerFP_EXTEND(Op, DAG);
7964 case ISD::FRAMEADDR:
7965 return LowerFRAMEADDR(Op, DAG);
7966 case ISD::SPONENTRY:
7967 return LowerSPONENTRY(Op, DAG);
7968 case ISD::RETURNADDR:
7969 return LowerRETURNADDR(Op, DAG);
7970   case ISD::ADDROFRETURNADDR:
7971     return LowerADDROFRETURNADDR(Op, DAG);
7972   case ISD::CONCAT_VECTORS:
7973     return LowerCONCAT_VECTORS(Op, DAG);
7974   case ISD::INSERT_VECTOR_ELT:
7975     return LowerINSERT_VECTOR_ELT(Op, DAG);
7976   case ISD::EXTRACT_VECTOR_ELT:
7977     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7978 case ISD::BUILD_VECTOR:
7979 return LowerBUILD_VECTOR(Op, DAG);
7980   case ISD::ANY_EXTEND_VECTOR_INREG:
7981   case ISD::SIGN_EXTEND_VECTOR_INREG:
7982     return LowerEXTEND_VECTOR_INREG(Op, DAG);
7983   case ISD::ZERO_EXTEND_VECTOR_INREG:
7984     return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7985   case ISD::VECTOR_SHUFFLE:
7986     return LowerVECTOR_SHUFFLE(Op, DAG);
7987 case ISD::SPLAT_VECTOR:
7988 return LowerSPLAT_VECTOR(Op, DAG);
7989   case ISD::EXTRACT_SUBVECTOR:
7990     return LowerEXTRACT_SUBVECTOR(Op, DAG);
7991   case ISD::INSERT_SUBVECTOR:
7992     return LowerINSERT_SUBVECTOR(Op, DAG);
7993 case ISD::SDIV:
7994 case ISD::UDIV:
7995 return LowerDIV(Op, DAG);
7996 case ISD::SMIN:
7997 case ISD::UMIN:
7998 case ISD::SMAX:
7999 case ISD::UMAX:
8000 return LowerMinMax(Op, DAG);
8001 case ISD::SRA:
8002 case ISD::SRL:
8003 case ISD::SHL:
8004 return LowerVectorSRA_SRL_SHL(Op, DAG);
8005 case ISD::SHL_PARTS:
8006 case ISD::SRL_PARTS:
8007 case ISD::SRA_PARTS:
8008 return LowerShiftParts(Op, DAG);
8009 case ISD::CTPOP:
8010 case ISD::PARITY:
8011 return LowerCTPOP_PARITY(Op, DAG);
8012 case ISD::FCOPYSIGN:
8013 return LowerFCOPYSIGN(Op, DAG);
8014 case ISD::OR:
8015 return LowerVectorOR(Op, DAG);
8016 case ISD::XOR:
8017 return LowerXOR(Op, DAG);
8018 case ISD::PREFETCH:
8019 return LowerPREFETCH(Op, DAG);
8020 case ISD::SINT_TO_FP:
8021 case ISD::UINT_TO_FP:
8022   case ISD::STRICT_SINT_TO_FP:
8023   case ISD::STRICT_UINT_TO_FP:
8024     return LowerINT_TO_FP(Op, DAG);
8025 case ISD::FP_TO_SINT:
8026 case ISD::FP_TO_UINT:
8027   case ISD::STRICT_FP_TO_SINT:
8028   case ISD::STRICT_FP_TO_UINT:
8029     return LowerFP_TO_INT(Op, DAG);
8030   case ISD::FP_TO_SINT_SAT:
8031   case ISD::FP_TO_UINT_SAT:
8032     return LowerFP_TO_INT_SAT(Op, DAG);
8033 case ISD::GET_ROUNDING:
8034 return LowerGET_ROUNDING(Op, DAG);
8035 case ISD::SET_ROUNDING:
8036 return LowerSET_ROUNDING(Op, DAG);
8037 case ISD::GET_FPMODE:
8038 return LowerGET_FPMODE(Op, DAG);
8039 case ISD::SET_FPMODE:
8040 return LowerSET_FPMODE(Op, DAG);
8041 case ISD::RESET_FPMODE:
8042 return LowerRESET_FPMODE(Op, DAG);
8043 case ISD::MUL:
8044 return LowerMUL(Op, DAG);
8045 case ISD::MULHS:
8046 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
8047 case ISD::MULHU:
8048 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
8049   case ISD::INTRINSIC_W_CHAIN:
8050     return LowerINTRINSIC_W_CHAIN(Op, DAG);
8051   case ISD::INTRINSIC_WO_CHAIN:
8052     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
8053   case ISD::INTRINSIC_VOID:
8054     return LowerINTRINSIC_VOID(Op, DAG);
8055 case ISD::ATOMIC_STORE:
8056 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
8057 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
8058 return LowerStore128(Op, DAG);
8059 }
8060 return SDValue();
8061 case ISD::STORE:
8062 return LowerSTORE(Op, DAG);
8063 case ISD::MSTORE:
8064 return LowerMSTORE(Op, DAG);
8065 case ISD::MGATHER:
8066 return LowerMGATHER(Op, DAG);
8067 case ISD::MSCATTER:
8068 return LowerMSCATTER(Op, DAG);
8069 case ISD::VECREDUCE_SEQ_FADD:
8070 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
8071 case ISD::VECREDUCE_ADD:
8072 case ISD::VECREDUCE_AND:
8073 case ISD::VECREDUCE_OR:
8074 case ISD::VECREDUCE_XOR:
8075 case ISD::VECREDUCE_SMAX:
8076 case ISD::VECREDUCE_SMIN:
8077 case ISD::VECREDUCE_UMAX:
8078 case ISD::VECREDUCE_UMIN:
8079 case ISD::VECREDUCE_FADD:
8080 case ISD::VECREDUCE_FMAX:
8081 case ISD::VECREDUCE_FMIN:
8082 case ISD::VECREDUCE_FMAXIMUM:
8083 case ISD::VECREDUCE_FMINIMUM:
8084 return LowerVECREDUCE(Op, DAG);
8085 case ISD::VECREDUCE_MUL:
8086 case ISD::VECREDUCE_FMUL:
8087 return LowerVECREDUCE_MUL(Op, DAG);
8088 case ISD::ATOMIC_LOAD_AND:
8089 return LowerATOMIC_LOAD_AND(Op, DAG);
8090 case ISD::DYNAMIC_STACKALLOC:
8091 return LowerDYNAMIC_STACKALLOC(Op, DAG);
8092 case ISD::VSCALE:
8093 return LowerVSCALE(Op, DAG);
8094   case ISD::VECTOR_COMPRESS:
8095     return LowerVECTOR_COMPRESS(Op, DAG);
8096 case ISD::ANY_EXTEND:
8097 case ISD::SIGN_EXTEND:
8098 case ISD::ZERO_EXTEND:
8099 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
8100 case ISD::ADDRSPACECAST:
8101 return LowerADDRSPACECAST(Op, DAG);
8102   case ISD::SIGN_EXTEND_INREG: {
8103     // Only custom lower when ExtraVT has a legal byte based element type.
8104 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
8105 EVT ExtraEltVT = ExtraVT.getVectorElementType();
8106 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
8107 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
8108 return SDValue();
8109
8110 return LowerToPredicatedOp(Op, DAG,
8111 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
8112 }
8113 case ISD::TRUNCATE:
8114 return LowerTRUNCATE(Op, DAG);
8115 case ISD::MLOAD:
8116 return LowerMLOAD(Op, DAG);
8117 case ISD::LOAD:
8118 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
8119 !Subtarget->isNeonAvailable()))
8120 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
8121 return LowerLOAD(Op, DAG);
8122 case ISD::ADD:
8123 case ISD::AND:
8124 case ISD::SUB:
8125 return LowerToScalableOp(Op, DAG);
8126 case ISD::FMAXIMUM:
8127 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
8128 case ISD::FMAXNUM:
8129 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
8130 case ISD::FMINIMUM:
8131 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
8132 case ISD::FMINNUM:
8133 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
8134 case ISD::VSELECT:
8135 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
8136 case ISD::ABS:
8137 return LowerABS(Op, DAG);
8138 case ISD::ABDS:
8139 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
8140 case ISD::ABDU:
8141 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
8142 case ISD::AVGFLOORS:
8143 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
8144 case ISD::AVGFLOORU:
8145 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
8146 case ISD::AVGCEILS:
8147 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
8148 case ISD::AVGCEILU:
8149 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
8150 case ISD::BITREVERSE:
8151 return LowerBitreverse(Op, DAG);
8152 case ISD::BSWAP:
8153 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
8154 case ISD::CTLZ:
8155 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
8156 case ISD::CTTZ:
8157 return LowerCTTZ(Op, DAG);
8158 case ISD::VECTOR_SPLICE:
8159 return LowerVECTOR_SPLICE(Op, DAG);
8160   case ISD::VECTOR_DEINTERLEAVE:
8161     return LowerVECTOR_DEINTERLEAVE(Op, DAG);
8162   case ISD::VECTOR_INTERLEAVE:
8163     return LowerVECTOR_INTERLEAVE(Op, DAG);
8164 case ISD::GET_ACTIVE_LANE_MASK:
8165 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
8166 case ISD::LRINT:
8167 case ISD::LLRINT:
8168 if (Op.getValueType().isVector())
8169 return LowerVectorXRINT(Op, DAG);
8170 [[fallthrough]];
8171 case ISD::LROUND:
8172 case ISD::LLROUND: {
8173 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
8174 Op.getOperand(0).getValueType() == MVT::bf16) &&
8175 "Expected custom lowering of rounding operations only for f16");
8176 SDLoc DL(Op);
8177 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
8178 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
8179 }
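  // For example, (llround f16:$x) is emitted as (llround (fp_extend f16:$x to f32));
  // the strict variants below apply the same widening, threaded through the chain.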
8180 case ISD::STRICT_LROUND:
8181  case ISD::STRICT_LLROUND:
8182  case ISD::STRICT_LRINT:
8183 case ISD::STRICT_LLRINT: {
8184 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
8185 Op.getOperand(1).getValueType() == MVT::bf16) &&
8186 "Expected custom lowering of rounding operations only for f16");
8187 SDLoc DL(Op);
8188 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
8189 {Op.getOperand(0), Op.getOperand(1)});
8190 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
8191 {Ext.getValue(1), Ext.getValue(0)});
8192 }
8193 case ISD::WRITE_REGISTER: {
8194 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
8195 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
8196 SDLoc DL(Op);
8197
8198 SDValue Chain = Op.getOperand(0);
8199 SDValue SysRegName = Op.getOperand(1);
8200 std::pair<SDValue, SDValue> Pair =
8201 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
8202
8203 // chain = MSRR(chain, sysregname, lo, hi)
8204 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
8205 SysRegName, Pair.first, Pair.second);
8206
8207 return Result;
8208 }
8209 case ISD::FSHL:
8210 case ISD::FSHR:
8211 return LowerFunnelShift(Op, DAG);
8212 case ISD::FLDEXP:
8213 return LowerFLDEXP(Op, DAG);
8214 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
8215 return LowerVECTOR_HISTOGRAM(Op, DAG);
8216 case ISD::PARTIAL_REDUCE_SMLA:
8217 case ISD::PARTIAL_REDUCE_UMLA:
8218 case ISD::PARTIAL_REDUCE_SUMLA:
8219 case ISD::PARTIAL_REDUCE_FMLA:
8220 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
8221 }
8222}
8223
8224 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
8225   return !Subtarget->useSVEForFixedLengthVectors();
8226}
8227
8228 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
8229     EVT VT, bool OverrideNEON) const {
8230 if (!VT.isFixedLengthVector() || !VT.isSimple())
8231 return false;
8232
8233 // Don't use SVE for vectors we cannot scalarize if required.
8234 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
8235 // Fixed length predicates should be promoted to i8.
8236 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
8237 case MVT::i1:
8238 default:
8239 return false;
8240 case MVT::i8:
8241 case MVT::i16:
8242 case MVT::i32:
8243 case MVT::i64:
8244 case MVT::f16:
8245 case MVT::f32:
8246 case MVT::f64:
8247 break;
8248 }
8249
8250 // NEON-sized vectors can be emulated using SVE instructions.
8251 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
8252 return Subtarget->isSVEorStreamingSVEAvailable();
8253
8254 // Ensure NEON MVTs only belong to a single register class.
8255 if (VT.getFixedSizeInBits() <= 128)
8256 return false;
8257
8258 // Ensure wider than NEON code generation is enabled.
8259 if (!Subtarget->useSVEForFixedLengthVectors())
8260 return false;
8261
8262 // Don't use SVE for types that don't fit.
8263 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
8264 return false;
8265
8266 // TODO: Perhaps an artificial restriction, but worth having whilst getting
8267 // the base fixed length SVE support in place.
8268 if (!VT.isPow2VectorType())
8269 return false;
8270
8271 return true;
8272}
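// Worked example for the checks above: with a minimum SVE vector size of 256 bits,
// v8i32 (256 bits) is wider than NEON, fits in the minimum SVE register and has a
// power-of-two element count, so it is lowered with SVE; v16i8 (128 bits) stays on
// NEON unless OverrideNEON is set.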
8273
8274//===----------------------------------------------------------------------===//
8275// Calling Convention Implementation
8276//===----------------------------------------------------------------------===//
8277
8278static unsigned getIntrinsicID(const SDNode *N) {
8279 unsigned Opcode = N->getOpcode();
8280 switch (Opcode) {
8281   default:
8282     return Intrinsic::not_intrinsic;
8283   case ISD::INTRINSIC_WO_CHAIN: {
8284     unsigned IID = N->getConstantOperandVal(0);
8285     if (IID < Intrinsic::num_intrinsics)
8286       return IID;
8287     return Intrinsic::not_intrinsic;
8288   }
8289 }
8290}
8291
8292 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
8293                                                 SDValue N1) const {
8294 if (!N0.hasOneUse())
8295 return false;
8296
8297 unsigned IID = getIntrinsicID(N1.getNode());
8298 // Avoid reassociating expressions that can be lowered to smlal/umlal.
8299 if (IID == Intrinsic::aarch64_neon_umull ||
8300 N1.getOpcode() == AArch64ISD::UMULL ||
8301 IID == Intrinsic::aarch64_neon_smull ||
8302 N1.getOpcode() == AArch64ISD::SMULL)
8303 return N0.getOpcode() != ISD::ADD;
8304
8305 return true;
8306}
8307
8308/// Selects the correct CCAssignFn for a given CallingConvention value.
8309 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
8310                                                      bool IsVarArg) const {
8311 switch (CC) {
8312 default:
8313 reportFatalUsageError("unsupported calling convention");
8314 case CallingConv::GHC:
8315 return CC_AArch64_GHC;
8316   case CallingConv::PreserveNone:
8317     // The VarArg implementation makes assumptions about register
8318 // argument passing that do not hold for preserve_none, so we
8319 // instead fall back to C argument passing.
8320 // The non-vararg case is handled in the CC function itself.
8321 if (!IsVarArg)
8322       return CC_AArch64_Preserve_None;
8323     [[fallthrough]];
8324 case CallingConv::C:
8325 case CallingConv::Fast:
8326   case CallingConv::PreserveMost:
8327   case CallingConv::PreserveAll:
8328   case CallingConv::CXX_FAST_TLS:
8329   case CallingConv::Swift:
8330   case CallingConv::SwiftTail:
8331   case CallingConv::Tail:
8332 case CallingConv::GRAAL:
8333 if (Subtarget->isTargetWindows()) {
8334 if (IsVarArg) {
8335 if (Subtarget->isWindowsArm64EC())
8336           return CC_AArch64_Arm64EC_VarArg;
8337         return CC_AArch64_Win64_VarArg;
8338       }
8339 return CC_AArch64_Win64PCS;
8340 }
8341 if (!Subtarget->isTargetDarwin())
8342 return CC_AArch64_AAPCS;
8343 if (!IsVarArg)
8344 return CC_AArch64_DarwinPCS;
8345 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8346                                       : CC_AArch64_DarwinPCS_VarArg;
8347   case CallingConv::Win64:
8348 if (IsVarArg) {
8349 if (Subtarget->isWindowsArm64EC())
8350         return CC_AArch64_Arm64EC_VarArg;
8351       return CC_AArch64_Win64_VarArg;
8352     }
8353 return CC_AArch64_Win64PCS;
8354   case CallingConv::CFGuard_Check:
8355     if (Subtarget->isWindowsArm64EC())
8356       return CC_AArch64_Arm64EC_CFGuard_Check;
8357     return CC_AArch64_Win64_CFGuard_Check;
8358   case CallingConv::AArch64_VectorCall:
8359   case CallingConv::AArch64_SVE_VectorCall:
8360   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
8361   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
8362   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
8363     return CC_AArch64_AAPCS;
8364   case CallingConv::ARM64EC_Thunk_X64:
8365     return CC_AArch64_Arm64EC_Thunk;
8366   case CallingConv::ARM64EC_Thunk_Native:
8367     return CC_AArch64_Arm64EC_Thunk_Native;
8368   }
8369}
8370
8371CCAssignFn *
8372 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
8373   switch (CC) {
8374 default:
8375 return RetCC_AArch64_AAPCS;
8376   case CallingConv::ARM64EC_Thunk_X64:
8377     return RetCC_AArch64_Arm64EC_Thunk;
8378   case CallingConv::CFGuard_Check:
8379     if (Subtarget->isWindowsArm64EC())
8380       return RetCC_AArch64_Arm64EC_CFGuard_Check;
8381     return RetCC_AArch64_AAPCS;
8382 }
8383}
8384
8385static bool isPassedInFPR(EVT VT) {
8386 return VT.isFixedLengthVector() ||
8387 (VT.isFloatingPoint() && !VT.isScalableVector());
8388}
8389
8390 static SDValue getZT0FrameIndex(MachineFrameInfo &MFI,
8391                                 AArch64FunctionInfo &FuncInfo,
8392 SelectionDAG &DAG) {
8393 if (!FuncInfo.hasZT0SpillSlotIndex())
8394 FuncInfo.setZT0SpillSlotIndex(MFI.CreateSpillStackObject(64, Align(16)));
8395
8396 return DAG.getFrameIndex(
8397 FuncInfo.getZT0SpillSlotIndex(),
8399}
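// ZT0 is 512 bits wide, hence the fixed 64-byte spill slot created above.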
8400
8401// Emit a call to __arm_sme_save or __arm_sme_restore.
8403 SelectionDAG &DAG,
8405 SDValue Chain, bool IsSave) {
8408 FuncInfo->setSMESaveBufferUsed();
8409   TargetLowering::ArgListTy Args;
8410   Args.emplace_back(
8411 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
8413
8414 RTLIB::Libcall LC =
8415 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
8416 SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
8417 TLI.getPointerTy(DAG.getDataLayout()));
8418 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8419   TargetLowering::CallLoweringInfo CLI(DAG);
8420   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8421 TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
8422 return TLI.LowerCallTo(CLI).second;
8423}
8424
8425 static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL,
8426                                      const AArch64TargetLowering &TLI,
8427 const AArch64RegisterInfo &TRI,
8428 AArch64FunctionInfo &FuncInfo,
8429 SelectionDAG &DAG) {
8430 // Conditionally restore the lazy save using a pseudo node.
8431 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
8432 TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj();
8433 SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask(
8435 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8436 TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout()));
8437 SDValue TPIDR2_EL0 = DAG.getNode(
8438 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain,
8439 DAG.getTargetConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8440 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8441 // RESTORE_ZA pseudo.
8442 SDValue Glue;
8443 SDValue TPIDR2Block = DAG.getFrameIndex(
8444 TPIDR2.FrameIndex,
8446 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue);
8447 Chain =
8448 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8449 {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8450 RestoreRoutine, RegMask, Chain.getValue(1)});
8451 // Finally reset the TPIDR2_EL0 register to 0.
8452 Chain = DAG.getNode(
8453 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8454 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8455 DAG.getConstant(0, DL, MVT::i64));
8456 TPIDR2.Uses++;
8457 return Chain;
8458}
8459
8460SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8461 SelectionDAG &DAG) const {
8462 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8463 SDValue Glue = Chain.getValue(1);
8464
8465 MachineFunction &MF = DAG.getMachineFunction();
8466 auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
8467 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
8468 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
8469
8470 SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
8471
8472 // The following conditions are true on entry to an exception handler:
8473 // - PSTATE.SM is 0.
8474 // - PSTATE.ZA is 0.
8475 // - TPIDR2_EL0 is null.
8476 // See:
8477 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8478 //
8479 // Therefore, if the function that contains this exception handler is a
8480 // streaming[-compatible] function, we must re-enable streaming mode.
8481 //
8482 // These mode changes are usually optimized away in catch blocks as they
8483 // occur before the __cxa_begin_catch (which is a non-streaming function),
8484 // but are necessary in some cases (such as for cleanups).
8485 //
8486 // Additionally, if the function has ZA or ZT0 state, we must restore it.
8487
8488 // [COND_]SMSTART SM
8489 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8490 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8491 /*Glue*/ Glue, AArch64SME::Always);
8492 else if (SMEFnAttrs.hasStreamingCompatibleInterface())
8493 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
8495
8496 if (getTM().useNewSMEABILowering())
8497 return Chain;
8498
8499 if (SMEFnAttrs.hasAgnosticZAInterface()) {
8500 // Restore full ZA
8501 Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain,
8502 /*IsSave=*/false);
8503 } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) {
8504 // SMSTART ZA
8505 Chain = DAG.getNode(
8506 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
8507 DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32));
8508
8509 // Restore ZT0
8510 if (SMEFnAttrs.hasZT0State()) {
8511 SDValue ZT0FrameIndex =
8512 getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG);
8513 Chain =
8514 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8515 {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex});
8516 }
8517
8518 // Restore ZA
8519 if (SMEFnAttrs.hasZAState())
8520 Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG);
8521 }
8522
8523 return Chain;
8524}
8525
8526SDValue AArch64TargetLowering::LowerFormalArguments(
8527 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8528 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8529 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8530 MachineFunction &MF = DAG.getMachineFunction();
8531 const Function &F = MF.getFunction();
8532 MachineFrameInfo &MFI = MF.getFrameInfo();
8533 bool IsWin64 =
8534 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8535 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8536 (isVarArg && Subtarget->isWindowsArm64EC());
8537 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8538
8540   GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8541                 DAG.getTargetLoweringInfo(), MF.getDataLayout());
8542 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8543 FuncInfo->setIsSVECC(true);
8544
8545 // Assign locations to all of the incoming arguments.
8546   SmallVector<CCValAssign, 16> ArgLocs;
8547   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8548
8549 // At this point, Ins[].VT may already be promoted to i32. To correctly
8550 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8551 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8552 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8553 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8554 // LocVT.
8555 unsigned NumArgs = Ins.size();
8556 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8557 unsigned CurArgIdx = 0;
8558 bool UseVarArgCC = false;
8559 if (IsWin64)
8560 UseVarArgCC = isVarArg;
8561
8562 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8563
8564 for (unsigned i = 0; i != NumArgs; ++i) {
8565 MVT ValVT = Ins[i].VT;
8566 if (Ins[i].isOrigArg()) {
8567 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8568 CurArgIdx = Ins[i].getOrigArgIndex();
8569
8570 // Get type of the original argument.
8571 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8572 /*AllowUnknown*/ true);
8573 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8574 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8575 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8576 ValVT = MVT::i8;
8577 else if (ActualMVT == MVT::i16)
8578 ValVT = MVT::i16;
8579 }
8580 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8581 Ins[i].OrigTy, CCInfo);
8582 assert(!Res && "Call operand has unhandled type");
8583 (void)Res;
8584 }
8585
8586 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8587 bool IsLocallyStreaming =
8588 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8589 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8590 SDValue Glue = Chain.getValue(1);
8591
8592 unsigned ExtraArgLocs = 0;
8593 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8594 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8595
8596 if (Ins[i].Flags.isByVal()) {
8597 // Byval is used for HFAs in the PCS, but the system should work in a
8598 // non-compliant manner for larger structs.
8599 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8600 int Size = Ins[i].Flags.getByValSize();
8601 unsigned NumRegs = (Size + 7) / 8;
8602
8603 // FIXME: This works on big-endian for composite byvals, which are the common
8604 // case. It should also work for fundamental types too.
8605 unsigned FrameIdx =
8606 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8607 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8608 InVals.push_back(FrameIdxN);
8609
8610 continue;
8611 }
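      // For example, a 20-byte byval struct rounds up to NumRegs = 3 eight-byte
      // slots, so a 24-byte fixed object is created at its stack offset above.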
8612
8613 if (Ins[i].Flags.isSwiftAsync())
8614 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8615
8616 SDValue ArgValue;
8617 if (VA.isRegLoc()) {
8618 // Arguments stored in registers.
8619 EVT RegVT = VA.getLocVT();
8620 const TargetRegisterClass *RC;
8621
8622 if (RegVT == MVT::i32)
8623 RC = &AArch64::GPR32RegClass;
8624 else if (RegVT == MVT::i64)
8625 RC = &AArch64::GPR64RegClass;
8626 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8627 RC = &AArch64::FPR16RegClass;
8628 else if (RegVT == MVT::f32)
8629 RC = &AArch64::FPR32RegClass;
8630 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8631 RC = &AArch64::FPR64RegClass;
8632 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8633 RC = &AArch64::FPR128RegClass;
8634 else if (RegVT.isScalableVector() &&
8635 RegVT.getVectorElementType() == MVT::i1) {
8636 FuncInfo->setIsSVECC(true);
8637 RC = &AArch64::PPRRegClass;
8638 } else if (RegVT == MVT::aarch64svcount) {
8639 FuncInfo->setIsSVECC(true);
8640 RC = &AArch64::PPRRegClass;
8641 } else if (RegVT.isScalableVector()) {
8642 FuncInfo->setIsSVECC(true);
8643 RC = &AArch64::ZPRRegClass;
8644 } else
8645 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8646
8647 // Transform the arguments in physical registers into virtual ones.
8648 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8649
8650 if (IsLocallyStreaming) {
8651 // LocallyStreamingFunctions must insert the SMSTART in the correct
8652 // position, so we use Glue to ensure no instructions can be scheduled
8653 // between the chain of:
8654 // t0: ch,glue = EntryNode
8655 // t1: res,ch,glue = CopyFromReg
8656 // ...
8657 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8658 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8659 // ^^^^^^
8660 // This will be the new Chain/Root node.
8661 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8662 Glue = ArgValue.getValue(2);
8663 if (isPassedInFPR(ArgValue.getValueType())) {
8664 ArgValue =
8665 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8666 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8667 {ArgValue, Glue});
8668 Glue = ArgValue.getValue(1);
8669 }
8670 } else
8671 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8672
8673 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8674 // to 64 bits. Insert an assert[sz]ext to capture this, then
8675 // truncate to the right size.
8676 switch (VA.getLocInfo()) {
8677 default:
8678 llvm_unreachable("Unknown loc info!");
8679 case CCValAssign::Full:
8680 break;
8681     case CCValAssign::Indirect:
8682       assert(
8683 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8684 "Indirect arguments should be scalable on most subtargets");
8685 break;
8686 case CCValAssign::BCvt:
8687 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8688 break;
8689 case CCValAssign::AExt:
8690 case CCValAssign::SExt:
8691 case CCValAssign::ZExt:
8692 break;
8693     case CCValAssign::AExtUpper:
8694       ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8695 DAG.getConstant(32, DL, RegVT));
8696 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8697 break;
8698 }
8699 } else { // VA.isRegLoc()
8700 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8701 unsigned ArgOffset = VA.getLocMemOffset();
8702 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8703 ? VA.getLocVT().getSizeInBits()
8704 : VA.getValVT().getSizeInBits()) / 8;
8705
8706 uint32_t BEAlign = 0;
8707 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8708 !Ins[i].Flags.isInConsecutiveRegs())
8709 BEAlign = 8 - ArgSize;
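      // e.g. a 4-byte argument on a big-endian target gets BEAlign = 4, so it is
      // loaded from the upper half of its 8-byte stack slot.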
8710
8711 SDValue FIN;
8712 MachinePointerInfo PtrInfo;
8713 if (StackViaX4) {
8714 // In both the ARM64EC varargs convention and the thunk convention,
8715 // arguments on the stack are accessed relative to x4, not sp. In
8716 // the thunk convention, there's an additional offset of 32 bytes
8717 // to account for the shadow store.
8718 unsigned ObjOffset = ArgOffset + BEAlign;
8719 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8720 ObjOffset += 32;
8721 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8722 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8723 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8724 DAG.getConstant(ObjOffset, DL, MVT::i64));
8726 } else {
8727 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8728
8729 // Create load nodes to retrieve arguments from the stack.
8730 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8731 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8732 }
8733
8734 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
8735       ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8736       MVT MemVT = VA.getValVT();
8737
8738 switch (VA.getLocInfo()) {
8739 default:
8740 break;
8741 case CCValAssign::Trunc:
8742 case CCValAssign::BCvt:
8743 MemVT = VA.getLocVT();
8744 break;
8745       case CCValAssign::Indirect:
8746         assert((VA.getValVT().isScalableVT() ||
8747                 Subtarget->isWindowsArm64EC()) &&
8748 "Indirect arguments should be scalable on most subtargets");
8749 MemVT = VA.getLocVT();
8750 break;
8751 case CCValAssign::SExt:
8752 ExtType = ISD::SEXTLOAD;
8753 break;
8754 case CCValAssign::ZExt:
8755 ExtType = ISD::ZEXTLOAD;
8756 break;
8757 case CCValAssign::AExt:
8758 ExtType = ISD::EXTLOAD;
8759 break;
8760 }
8761
8762 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8763 MemVT);
8764 }
8765
8766 if (VA.getLocInfo() == CCValAssign::Indirect) {
8767 assert((VA.getValVT().isScalableVT() ||
8768 Subtarget->isWindowsArm64EC()) &&
8769 "Indirect arguments should be scalable on most subtargets");
8770
8771 TypeSize PartSize = VA.getValVT().getStoreSize();
8772 unsigned NumParts = 1;
8773 if (Ins[i].Flags.isInConsecutiveRegs()) {
8774 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8775 ++NumParts;
8776 }
8777
8778 MVT PartLoad = VA.getValVT();
8779 SDValue Ptr = ArgValue;
8780
8781 // Ensure we generate all loads for each tuple part, whilst updating the
8782 // pointer after each load correctly using vscale.
8783 while (NumParts > 0) {
8784 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8785 InVals.push_back(ArgValue);
8786 NumParts--;
8787 if (NumParts > 0) {
8788 SDValue BytesIncrement =
8789 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
8790 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8791 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8792 ExtraArgLocs++;
8793 i++;
8794 }
8795 }
8796 } else {
8797 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8798 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8799 ArgValue, DAG.getValueType(MVT::i32));
8800
8801 // i1 arguments are zero-extended to i8 by the caller. Emit a
8802 // hint to reflect this.
8803 if (Ins[i].isOrigArg()) {
8804 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8805 if (OrigArg->getType()->isIntegerTy(1)) {
8806 if (!Ins[i].Flags.isZExt()) {
8807 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8808 ArgValue.getValueType(), ArgValue);
8809 }
8810 }
8811 }
8812
8813 InVals.push_back(ArgValue);
8814 }
8815 }
8816 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8817
8818 if (Attrs.hasStreamingCompatibleInterface()) {
8819 SDValue EntryPStateSM =
8820 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8821 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8822
8823 // Copy the value to a virtual register, and save that in FuncInfo.
8824 Register EntryPStateSMReg =
8825 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8826 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8827 EntryPStateSM);
8828 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8829 }
8830
8831 // Insert the SMSTART if this is a locally streaming function and
8832 // make sure it is Glued to the last CopyFromReg value.
8833 if (IsLocallyStreaming) {
8834 if (Attrs.hasStreamingCompatibleInterface())
8835 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8837 else
8838 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8840
8841 // Ensure that the SMSTART happens after the CopyWithChain such that its
8842 // chain result is used.
8843 for (unsigned I=0; I<InVals.size(); ++I) {
8844       Register Reg = MF.getRegInfo().createVirtualRegister(
8845           getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8846       Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8847 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8848 InVals[I].getValueType());
8849 }
8850 }
8851
8852 // varargs
8853 if (isVarArg) {
8855 if (!Subtarget->isTargetDarwin() || IsWin64) {
8856 // The AAPCS variadic function ABI is identical to the non-variadic
8857 // one. As a result there may be more arguments in registers and we
8858 // should save them for future reference.
8859 // Win64 variadic functions also pass arguments in registers, but all
8860 // float arguments are passed in integer registers.
8861 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8862 }
8863
8864 // This will point to the next argument passed via stack.
8865 unsigned VarArgsOffset = CCInfo.getStackSize();
8866 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8867 VarArgsOffset =
8868 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8869 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8870 FuncInfo->setVarArgsStackIndex(
8871 MFI.CreateFixedObject(4, VarArgsOffset, true));
8872 }
8873
8874 if (MFI.hasMustTailInVarArgFunc()) {
8875 SmallVector<MVT, 2> RegParmTypes;
8876 RegParmTypes.push_back(MVT::i64);
8877 RegParmTypes.push_back(MVT::f128);
8878 // Compute the set of forwarded registers. The rest are scratch.
8879 SmallVectorImpl<ForwardedRegister> &Forwards =
8880 FuncInfo->getForwardedMustTailRegParms();
8881 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8882                                               CC_AArch64_AAPCS);
8883
8884 // Conservatively forward X8, since it might be used for aggregate return.
8885 if (!CCInfo.isAllocated(AArch64::X8)) {
8886 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8887 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8888 }
8889 }
8890 }
8891
8892 // On Windows, InReg pointers must be returned, so record the pointer in a
8893 // virtual register at the start of the function so it can be returned in the
8894 // epilogue.
8895 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8896 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8897 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8898 Ins[I].Flags.isInReg()) &&
8899 Ins[I].Flags.isSRet()) {
8900 assert(!FuncInfo->getSRetReturnReg());
8901
8902 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8903 Register Reg =
8904           MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8905       FuncInfo->setSRetReturnReg(Reg);
8906
8907 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8908 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8909 break;
8910 }
8911 }
8912 }
8913
8914 unsigned StackArgSize = CCInfo.getStackSize();
8915 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8916 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8917 // This is a non-standard ABI so by fiat I say we're allowed to make full
8918 // use of the stack area to be popped, which must be aligned to 16 bytes in
8919 // any case:
8920 StackArgSize = alignTo(StackArgSize, 16);
8921
8922 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8923 // a multiple of 16.
8924 FuncInfo->setArgumentStackToRestore(StackArgSize);
8925
8926 // This realignment carries over to the available bytes below. Our own
8927 // callers will guarantee the space is free by giving an aligned value to
8928 // CALLSEQ_START.
8929 }
8930 // Even if we're not expected to free up the space, it's useful to know how
8931 // much is there while considering tail calls (because we can reuse it).
8932 FuncInfo->setBytesInStackArgArea(StackArgSize);
8933
8934 if (Subtarget->hasCustomCallingConv())
8935 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8936
8937 if (getTM().useNewSMEABILowering()) {
8938 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
8939 SDValue Size;
8940 if (Attrs.hasZAState()) {
8941 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8942 DAG.getConstant(1, DL, MVT::i32));
8943 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8944 } else if (Attrs.hasAgnosticZAInterface()) {
8945 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
8946         SDValue Callee = DAG.getExternalSymbol(
8947             getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
8948         auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
8949 TargetLowering::CallLoweringInfo CLI(DAG);
8950 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8951 getLibcallCallingConv(LC), RetTy, Callee, {});
8952 std::tie(Size, Chain) = LowerCallTo(CLI);
8953 }
8954 if (Size) {
8955 SDValue Buffer = DAG.getNode(
8956 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8957 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8958 Chain = Buffer.getValue(1);
8959
8960 Register BufferPtr =
8961 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8962 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8963 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
8964 DAG.getVTList(MVT::Other), Chain);
8965 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
8966 MFI.CreateVariableSizedObject(Align(16), nullptr);
8967 }
8968 }
8969 } else {
8970 // Old SME ABI lowering (deprecated):
8971 // Create a 16 Byte TPIDR2 object. The dynamic buffer
8972 // will be expanded and stored in the static object later using a
8973 // pseudonode.
8974 if (Attrs.hasZAState()) {
8975 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8976 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8977 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8978 DAG.getConstant(1, DL, MVT::i32));
8979 SDValue Buffer;
8980 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8981 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8982 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8983 } else {
8984 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8985 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8986 DAG.getVTList(MVT::i64, MVT::Other),
8987 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8988 MFI.CreateVariableSizedObject(Align(16), nullptr);
8989 }
8990 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8991 DAG.getConstant(1, DL, MVT::i32));
8992 Chain = DAG.getNode(
8993 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8994 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
8995 /*Num save slices*/ NumZaSaveSlices});
8996 } else if (Attrs.hasAgnosticZAInterface()) {
8997 // Call __arm_sme_state_size().
8998 SDValue BufferSize =
8999 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
9000 DAG.getVTList(MVT::i64, MVT::Other), Chain);
9001 Chain = BufferSize.getValue(1);
9002 SDValue Buffer;
9003 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
9004 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
9005 DAG.getVTList(MVT::i64, MVT::Other),
9006 {Chain, BufferSize});
9007 } else {
9008 // Allocate space dynamically.
9009 Buffer = DAG.getNode(
9010 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
9011 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
9012 MFI.CreateVariableSizedObject(Align(16), nullptr);
9013 }
9014 // Copy the value to a virtual register, and save that in FuncInfo.
9015 Register BufferPtr =
9016 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
9017 FuncInfo->setSMESaveBufferAddr(BufferPtr);
9018 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
9019 }
9020 }
9021
9022 if (CallConv == CallingConv::PreserveNone) {
9023 for (const ISD::InputArg &I : Ins) {
9024 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
9025 I.Flags.isSwiftAsync()) {
9026 MachineFunction &MF = DAG.getMachineFunction();
9027 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9028 MF.getFunction(),
9029 "Swift attributes can't be used with preserve_none",
9030 DL.getDebugLoc()));
9031 break;
9032 }
9033 }
9034 }
9035
9036 return Chain;
9037}
9038
9039void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
9040 SelectionDAG &DAG,
9041 const SDLoc &DL,
9042 SDValue &Chain) const {
9043 MachineFunction &MF = DAG.getMachineFunction();
9044 MachineFrameInfo &MFI = MF.getFrameInfo();
9045 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9046 auto PtrVT = getPointerTy(DAG.getDataLayout());
9047 Function &F = MF.getFunction();
9048 bool IsWin64 =
9049 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
9050
9052
9054 unsigned NumGPRArgRegs = GPRArgRegs.size();
9055 if (Subtarget->isWindowsArm64EC()) {
9056 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
9057 // functions.
9058 NumGPRArgRegs = 4;
9059 }
9060 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
9061
9062 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
9063 int GPRIdx = 0;
9064 if (GPRSaveSize != 0) {
9065 if (IsWin64) {
9066 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
9067 if (GPRSaveSize & 15)
9068 // The extra size here, if triggered, will always be 8.
9069 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
9070 } else
9071 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
9072
9073 SDValue FIN;
9074 if (Subtarget->isWindowsArm64EC()) {
9075 // With the Arm64EC ABI, we reserve the save area as usual, but we
9076 // compute its address relative to x4. For a normal AArch64->AArch64
9077 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
9078 // different address.
9079 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
9080 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
9081 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
9082 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
9083 } else {
9084 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
9085 }
9086
9087 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
9088 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
9089 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
9090 SDValue Store =
9091 DAG.getStore(Val.getValue(1), DL, Val, FIN,
9092                        IsWin64 ? MachinePointerInfo::getFixedStack(
9093                                      MF, GPRIdx, (i - FirstVariadicGPR) * 8)
9094 : MachinePointerInfo::getStack(MF, i * 8));
9095 MemOps.push_back(Store);
9096 FIN =
9097 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
9098 }
9099 }
9100 FuncInfo->setVarArgsGPRIndex(GPRIdx);
9101 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
9102
9103 if (Subtarget->hasFPARMv8() && !IsWin64) {
9105 const unsigned NumFPRArgRegs = FPRArgRegs.size();
9106 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
9107
9108 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
9109 int FPRIdx = 0;
9110 if (FPRSaveSize != 0) {
9111 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
9112
9113 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
9114
9115 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
9116 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
9117 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
9118
9119 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
9120 MachinePointerInfo::getStack(MF, i * 16));
9121 MemOps.push_back(Store);
9122 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
9123 DAG.getConstant(16, DL, PtrVT));
9124 }
9125 }
9126 FuncInfo->setVarArgsFPRIndex(FPRIdx);
9127 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
9128 }
9129
9130 if (!MemOps.empty()) {
9131 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
9132 }
9133}
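// Example of the sizes computed above: if the fixed arguments of an AAPCS vararg
// function consume x0-x2 and no FPRs, then GPRSaveSize = 8 * 5 = 40 (x3-x7) and
// FPRSaveSize = 16 * 8 = 128 (q0-q7).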
9134
9135/// LowerCallResult - Lower the result values of a call into the
9136/// appropriate copies out of appropriate physical registers.
9137SDValue AArch64TargetLowering::LowerCallResult(
9138 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
9139 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
9140 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
9141 SDValue ThisVal, bool RequiresSMChange) const {
9142 DenseMap<unsigned, SDValue> CopiedRegs;
9143 // Copy all of the result registers out of their specified physreg.
9144 for (unsigned i = 0; i != RVLocs.size(); ++i) {
9145 CCValAssign VA = RVLocs[i];
9146
9147 // Pass 'this' value directly from the argument to return value, to avoid
9148 // reg unit interference
9149 if (i == 0 && isThisReturn) {
9150 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
9151 "unexpected return calling convention register assignment");
9152 InVals.push_back(ThisVal);
9153 continue;
9154 }
9155
9156 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
9157 // allows one use of a physreg per block.
9158 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
9159 if (!Val) {
9160 Val =
9161 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
9162 Chain = Val.getValue(1);
9163 InGlue = Val.getValue(2);
9164 CopiedRegs[VA.getLocReg()] = Val;
9165 }
9166
9167 switch (VA.getLocInfo()) {
9168 default:
9169 llvm_unreachable("Unknown loc info!");
9170 case CCValAssign::Full:
9171 break;
9172 case CCValAssign::BCvt:
9173 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
9174 break;
9175     case CCValAssign::AExtUpper:
9176       Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
9177 DAG.getConstant(32, DL, VA.getLocVT()));
9178 [[fallthrough]];
9179 case CCValAssign::AExt:
9180 [[fallthrough]];
9181 case CCValAssign::ZExt:
9182 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
9183 break;
9184 }
9185
9186 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
9187 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9188 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
9189
9190 InVals.push_back(Val);
9191 }
9192
9193 return Chain;
9194}
9195
9196/// Return true if the calling convention is one that we can guarantee TCO for.
9197static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
9198 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
9199          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
9200}
9201
9202/// Return true if we might ever do TCO for calls with this calling convention.
9203 static bool mayTailCallThisCC(CallingConv::ID CC) {
9204   switch (CC) {
9205 case CallingConv::C:
9210 case CallingConv::Swift:
9211   case CallingConv::SwiftTail:
9212   case CallingConv::Tail:
9213 case CallingConv::Fast:
9214 return true;
9215 default:
9216 return false;
9217 }
9218}
9219
9220/// Return true if the call convention supports varargs
9221/// Currently only those that pass varargs like the C
9222/// calling convention does are eligible
9223/// Calling conventions listed in this function must also
9224/// be properly handled in AArch64Subtarget::isCallingConvWin64
9225 static bool callConvSupportsVarArgs(CallingConv::ID CC) {
9226   switch (CC) {
9227 case CallingConv::C:
9229 // SVE vector call is only partially supported, but it should
9230 // support named arguments being passed. Any arguments being passed
9231 // as varargs, are still unsupported.
9232   case CallingConv::AArch64_SVE_VectorCall:
9233     return true;
9234 default:
9235 return false;
9236 }
9237}
9238
9239 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
9240                                 const AArch64Subtarget *Subtarget,
9241                                 const TargetLowering::CallLoweringInfo &CLI,
9242                                 CCState &CCInfo) {
9243 const SelectionDAG &DAG = CLI.DAG;
9244 CallingConv::ID CalleeCC = CLI.CallConv;
9245 bool IsVarArg = CLI.IsVarArg;
9246 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9247 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
9248
9249 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
9250 // for the shadow store.
9251 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
9252 CCInfo.AllocateStack(32, Align(16));
9253
9254 unsigned NumArgs = Outs.size();
9255 for (unsigned i = 0; i != NumArgs; ++i) {
9256 MVT ArgVT = Outs[i].VT;
9257 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
9258
9259 bool UseVarArgCC = false;
9260 if (IsVarArg) {
9261 // On Windows, the fixed arguments in a vararg call are passed in GPRs
9262 // too, so use the vararg CC to force them to integer registers.
9263 if (IsCalleeWin64) {
9264 UseVarArgCC = true;
9265 } else {
9266 UseVarArgCC = ArgFlags.isVarArg();
9267 }
9268 }
9269
9270 if (!UseVarArgCC) {
9271 // Get type of the original argument.
9272 EVT ActualVT =
9273 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
9274 /*AllowUnknown*/ true);
9275 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
9276 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
9277 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
9278 ArgVT = MVT::i8;
9279 else if (ActualMVT == MVT::i16)
9280 ArgVT = MVT::i16;
9281 }
9282
9283 // FIXME: CCAssignFnForCall should be called once, for the call and not per
9284 // argument. This logic should exactly mirror LowerFormalArguments.
9285 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
9286 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
9287 Outs[i].OrigTy, CCInfo);
9288 assert(!Res && "Call operand has unhandled type");
9289 (void)Res;
9290 }
9291}
9292
9293static SMECallAttrs
9296 if (CLI.CB)
9297 return SMECallAttrs(*CLI.CB, &RTLCI);
9298 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9299 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));
9301}
9302
9303bool AArch64TargetLowering::isEligibleForTailCallOptimization(
9304 const CallLoweringInfo &CLI) const {
9305 CallingConv::ID CalleeCC = CLI.CallConv;
9306 if (!mayTailCallThisCC(CalleeCC))
9307 return false;
9308
9309 SDValue Callee = CLI.Callee;
9310 bool IsVarArg = CLI.IsVarArg;
9311 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9312 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9313 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9314 const SelectionDAG &DAG = CLI.DAG;
9315 MachineFunction &MF = DAG.getMachineFunction();
9316 const Function &CallerF = MF.getFunction();
9317 CallingConv::ID CallerCC = CallerF.getCallingConv();
9318
9319 // SME Streaming functions are not eligible for TCO as they may require
9320 // the streaming mode or ZA/ZT0 to be restored after returning from the call.
9321 SMECallAttrs CallAttrs =
9322 getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
9323 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
9324 CallAttrs.requiresPreservingAllZAState() ||
9325 CallAttrs.requiresPreservingZT0() ||
9326 CallAttrs.caller().hasStreamingBody())
9327 return false;
9328
9329 // Functions using the C or Fast calling convention that have an SVE signature
9330 // preserve more registers and should assume the SVE_VectorCall CC.
9331 // The check for matching callee-saved regs will determine whether it is
9332 // eligible for TCO.
9333 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
9334 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
9335     CallerCC = CallingConv::AArch64_SVE_VectorCall;
9336
9337 bool CCMatch = CallerCC == CalleeCC;
9338
9339 // When using the Windows calling convention on a non-windows OS, we want
9340 // to back up and restore X18 in such functions; we can't do a tail call
9341 // from those functions.
9342 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
9343 CalleeCC != CallingConv::Win64)
9344 return false;
9345
9346 // Byval parameters hand the function a pointer directly into the stack area
9347 // we want to reuse during a tail call. Working around this *is* possible (see
9348 // X86) but less efficient and uglier in LowerCall.
9349 for (Function::const_arg_iterator i = CallerF.arg_begin(),
9350 e = CallerF.arg_end();
9351 i != e; ++i) {
9352 if (i->hasByValAttr())
9353 return false;
9354
9355 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
9356 // In this case, it is necessary to save X0/X1 in the callee and return it
9357 // in X0. Tail call opt may interfere with this, so we disable tail call
9358 // opt when the caller has an "inreg" attribute -- except if the callee
9359 // also has that attribute on the same argument, and the same value is
9360 // passed.
9361 if (i->hasInRegAttr()) {
9362 unsigned ArgIdx = i - CallerF.arg_begin();
9363 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
9364 return false;
9365 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
9366 if (!Attrs.hasAttribute(Attribute::InReg) ||
9367 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
9368 CLI.CB->getArgOperand(ArgIdx) != i) {
9369 return false;
9370 }
9371 }
9372 }
9373
9374 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
9375 return CCMatch;
9376
9377 // Externally-defined functions with weak linkage should not be
9378 // tail-called on AArch64 when the OS does not support dynamic
9379 // pre-emption of symbols, as the AAELF spec requires normal calls
9380 // to undefined weak functions to be replaced with a NOP or jump to the
9381 // next instruction. The behaviour of branch instructions in this
9382 // situation (as used for tail calls) is implementation-defined, so we
9383 // cannot rely on the linker replacing the tail call with a return.
9384 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9385 const GlobalValue *GV = G->getGlobal();
9386 const Triple &TT = getTargetMachine().getTargetTriple();
9387 if (GV->hasExternalWeakLinkage() &&
9388 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
9389 return false;
9390 }
9391
9392 // Now we search for cases where we can use a tail call without changing the
9393 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
9394 // concept.
9395
9396 // I want anyone implementing a new calling convention to think long and hard
9397 // about this assert.
9398 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
9399 report_fatal_error("Unsupported variadic calling convention");
9400
9401 LLVMContext &C = *DAG.getContext();
9402 // Check that the call results are passed in the same way.
9403 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
9404 CCAssignFnForCall(CalleeCC, IsVarArg),
9405 CCAssignFnForCall(CallerCC, IsVarArg)))
9406 return false;
9407 // The callee has to preserve all registers the caller needs to preserve.
9408 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9409 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
9410 if (!CCMatch) {
9411 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
9412 if (Subtarget->hasCustomCallingConv()) {
9413 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
9414 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
9415 }
9416 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
9417 return false;
9418 }
9419
9420 // Nothing more to check if the callee is taking no arguments
9421 if (Outs.empty())
9422 return true;
9423
9424   SmallVector<CCValAssign, 16> ArgLocs;
9425   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
9426
9427 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9428
9429 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
9430     // When the call is musttail, additional checks have already been done, so we can safely skip this check.
9431 // At least two cases here: if caller is fastcc then we can't have any
9432 // memory arguments (we'd be expected to clean up the stack afterwards). If
9433 // caller is C then we could potentially use its argument area.
9434
9435 // FIXME: for now we take the most conservative of these in both cases:
9436 // disallow all variadic memory operands.
9437 for (const CCValAssign &ArgLoc : ArgLocs)
9438 if (!ArgLoc.isRegLoc())
9439 return false;
9440 }
9441
9442 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9443
9444 // If any of the arguments is passed indirectly, it must be SVE, so the
9445 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
9446   // allocate space on the stack. That is why we determine explicitly here that
9447   // the call cannot be a tail call.
9448 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
9449 assert((A.getLocInfo() != CCValAssign::Indirect ||
9450 A.getValVT().isScalableVector() ||
9451 Subtarget->isWindowsArm64EC()) &&
9452 "Expected value to be scalable");
9453 return A.getLocInfo() == CCValAssign::Indirect;
9454 }))
9455 return false;
9456
9457 // If the stack arguments for this call do not fit into our own save area then
9458 // the call cannot be made tail.
9459 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
9460 return false;
9461
9462 const MachineRegisterInfo &MRI = MF.getRegInfo();
9463 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
9464 return false;
9465
9466 return true;
9467}
9468
9469SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9470 SelectionDAG &DAG,
9471 MachineFrameInfo &MFI,
9472 int ClobberedFI) const {
9473 SmallVector<SDValue, 8> ArgChains;
9474 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9475 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9476
9477 // Include the original chain at the beginning of the list. When this is
9478 // used by target LowerCall hooks, this helps legalize find the
9479 // CALLSEQ_BEGIN node.
9480 ArgChains.push_back(Chain);
9481
9482 // Add a chain value for each stack argument corresponding
9483 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9484 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9485 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9486 if (FI->getIndex() < 0) {
9487 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9488 int64_t InLastByte = InFirstByte;
9489 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9490
9491 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9492 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9493 ArgChains.push_back(SDValue(L, 1));
9494 }
9495
9496 // Build a tokenfactor for all the chains.
9497 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9498}
9499
9500bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9501 bool TailCallOpt) const {
9502 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9503 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9504}
9505
9506// Check if the value is zero-extended from i1 to i8
9507static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9508 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9509 if (SizeInBits < 8)
9510 return false;
9511
9512 APInt RequiredZero(SizeInBits, 0xFE);
9513 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9514 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9515 return ZExtBool;
9516}
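// E.g. a value produced by (zext i1 %b to i8) has known-zero bits 0xFE, so the
// mask check above proves only bit 0 can be set and the zero-extension performed
// by the caller can be trusted.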
9517
9518void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9519 SDNode *Node) const {
9520 // Live-in physreg copies that are glued to SMSTART are applied as
9521 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9522 // register allocator to pass call args in callee saved regs, without extra
9523 // copies to avoid these fake clobbers of actually-preserved GPRs.
9524 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9525 MI.getOpcode() == AArch64::MSRpstatePseudo) {
9526 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9527 if (MachineOperand &MO = MI.getOperand(I);
9528 MO.isReg() && MO.isImplicit() && MO.isDef() &&
9529 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9530 AArch64::GPR64RegClass.contains(MO.getReg())))
9531 MI.removeOperand(I);
9532
9533 // The SVE vector length can change when entering/leaving streaming mode.
9534 // FPMR is set to 0 when entering/leaving streaming mode.
9535 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9536 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
9537 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9538 /*IsImplicit=*/true));
9539 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9540 /*IsImplicit=*/true));
9541 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9542 /*IsImplicit=*/true));
9543 }
9544 }
9545
9546 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9547 // have nothing to do with VG, were it not that they are used to materialise a
9548 // frame-address. If they contain a frame-index to a scalable vector, this
9549 // will likely require an ADDVL instruction to materialise the address, thus
9550 // reading VG.
9551 const MachineFunction &MF = *MI.getMF();
9552 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9553 (MI.getOpcode() == AArch64::ADDXri ||
9554 MI.getOpcode() == AArch64::SUBXri)) {
9555 const MachineOperand &MO = MI.getOperand(1);
9556 if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex()))
9557 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9558 /*IsImplicit=*/true));
9559 }
9560}
9561
9562 SDValue AArch64TargetLowering::changeStreamingMode(
9563     SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9564 unsigned Condition, bool InsertVectorLengthCheck) const {
9565   MachineFunction &MF = DAG.getMachineFunction();
9566   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9567   FuncInfo->setHasStreamingModeChanges(true);
9568
9569 auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9570 SmallVector<SDValue, 2> Ops = {Chain};
9571 if (InGlue)
9572 Ops.push_back(InGlue);
9573 return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9574 DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9575 };
9576
9577 if (InsertVectorLengthCheck && Enable) {
9578 // Non-streaming -> Streaming
9579 // Insert vector length check before smstart
9580 SDValue CheckVL = GetCheckVL(Chain, InGlue);
9581 Chain = CheckVL.getValue(0);
9582 InGlue = CheckVL.getValue(1);
9583 }
9584
9585 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9586 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9587 SDValue MSROp =
9588 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9589 SmallVector<SDValue> Ops = {Chain, MSROp};
9590 unsigned Opcode;
9591 if (Condition != AArch64SME::Always) {
9592 Register PStateReg = FuncInfo->getPStateSMReg();
9593 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9594 SDValue PStateSM =
9595 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9596 // Use chain and glue from the CopyFromReg.
9597 Ops[0] = PStateSM.getValue(1);
9598 InGlue = PStateSM.getValue(2);
9599 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9600 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9601 Ops.push_back(ConditionOp);
9602 Ops.push_back(PStateSM);
9603 } else {
9604 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9605 }
9606 Ops.push_back(RegMask);
9607
9608 if (InGlue)
9609 Ops.push_back(InGlue);
9610
9611 SDValue SMChange =
9612 DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9613
9614 if (!InsertVectorLengthCheck || Enable)
9615 return SMChange;
9616
9617 // Streaming -> Non-streaming
9618 // Insert vector length check after smstop since we cannot read VL
9619 // in streaming mode
9620 return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9621}
9622
9623 static AArch64SME::ToggleCondition
9624 getSMToggleCondition(const SMECallAttrs &CallAttrs) {
9625   if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9626 CallAttrs.caller().hasStreamingBody())
9627 return AArch64SME::Always;
9628 if (CallAttrs.callee().hasNonStreamingInterface())
9629     return AArch64SME::IfCallerIsStreaming;
9630   if (CallAttrs.callee().hasStreamingInterface())
9631     return AArch64SME::IfCallerIsNotStreaming;
9632
9633 llvm_unreachable("Unsupported attributes");
9634}
9635
9636/// Check whether a stack argument requires lowering in a tail call.
9637 static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
9638                                         const CCValAssign &VA, SDValue Arg,
9639 ISD::ArgFlagsTy Flags, int CallOffset) {
9640 // FIXME: We should be able to handle this case, but it's not clear how to.
9641 if (Flags.isZExt() || Flags.isSExt())
9642 return true;
9643
9644 for (;;) {
9645 // Look through nodes that don't alter the bits of the incoming value.
9646 unsigned Op = Arg.getOpcode();
9647 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9648 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9649 Arg = Arg.getOperand(0);
9650 continue;
9651 }
9652 break;
9653 }
9654
9655 // If the argument is a load from the same immutable stack slot, we can reuse
9656 // it.
9657 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9658 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9659 const MachineFrameInfo &MFI = MF.getFrameInfo();
9660 int FI = FINode->getIndex();
9661 if (!MFI.isImmutableObjectIndex(FI))
9662 return true;
9663 if (CallOffset != MFI.getObjectOffset(FI))
9664 return true;
9665 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9666 if (SizeInBits / 8 != static_cast<uint64_t>(MFI.getObjectSize(FI)))
9667 return true;
9668 return false;
9669 }
9670 }
9671
9672 return true;
9673}
9674
9675/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9676/// and add input and output parameter nodes.
9677SDValue
9678AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9679 SmallVectorImpl<SDValue> &InVals) const {
9680 SelectionDAG &DAG = CLI.DAG;
9681 SDLoc &DL = CLI.DL;
9682 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9683 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9684 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9685 SDValue Chain = CLI.Chain;
9686 SDValue Callee = CLI.Callee;
9687 bool &IsTailCall = CLI.IsTailCall;
9688 CallingConv::ID &CallConv = CLI.CallConv;
9689 bool IsVarArg = CLI.IsVarArg;
9690 const CallBase *CB = CLI.CB;
9691
9692 MachineFunction &MF = DAG.getMachineFunction();
9693 MachineFunction::CallSiteInfo CSInfo;
9694 bool IsThisReturn = false;
9695
9696 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9697 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9698 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9699 bool IsSibCall = false;
9700 bool GuardWithBTI = false;
9701
9702 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9703 !Subtarget->noBTIAtReturnTwice()) {
9704 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9705 }
9706
9707 // Analyze operands of the call, assigning locations to each operand.
9708   SmallVector<CCValAssign, 16> ArgLocs;
9709   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9710
9711 if (IsVarArg) {
9712 unsigned NumArgs = Outs.size();
9713
9714 for (unsigned i = 0; i != NumArgs; ++i) {
9715 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9716 report_fatal_error("Passing SVE types to variadic functions is "
9717 "currently not supported");
9718 }
9719 }
9720
9721 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9722
9723 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9724 // Assign locations to each value returned by this call.
9725 SmallVector<CCValAssign, 16> RVLocs;
9726 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9727 *DAG.getContext());
9728 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9729
9730 // Set type id for call site info.
9731 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9732 CSInfo = MachineFunction::CallSiteInfo(*CB);
9733
9734 // Check callee args/returns for SVE registers and set calling convention
9735 // accordingly.
9736 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9737 auto HasSVERegLoc = [](CCValAssign &Loc) {
9738 if (!Loc.isRegLoc())
9739 return false;
9740 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9741 AArch64::PPRRegClass.contains(Loc.getLocReg());
9742 };
9743 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9744 CallConv = CallingConv::AArch64_SVE_VectorCall;
9745 }
9746
9747 // Determine whether we need any streaming mode changes.
9748 SMECallAttrs CallAttrs =
9749 getSMECallAttrs(MF.getFunction(), *this, CLI);
9750
9751 std::optional<unsigned> ZAMarkerNode;
9752 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9753
9754 if (UseNewSMEABILowering) {
9755 if (CallAttrs.requiresLazySave() ||
9756 CallAttrs.requiresPreservingAllZAState())
9757 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9758 else if (CallAttrs.requiresPreservingZT0())
9759 ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE;
9760 else if (CallAttrs.caller().hasZAState() ||
9761 CallAttrs.caller().hasZT0State())
9762 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
9763 }
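 // For example: a caller with ZA state calling a private-ZA callee needs a
 // full lazy save (REQUIRES_ZA_SAVE), whereas calling a shared-ZA callee only
 // needs to record that ZA is live across the call (INOUT_ZA_USE).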
9764
9765 if (IsTailCall) {
9766 // Check if it's really possible to do a tail call.
9767 IsTailCall = isEligibleForTailCallOptimization(CLI);
9768
9769 // A sibling call is one where we're under the usual C ABI and not planning
9770 // to change that but can still do a tail call:
9771 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9772 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9773 IsSibCall = true;
9774
9775 if (IsTailCall)
9776 ++NumTailCalls;
9777 }
9778
9779 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9780 report_fatal_error("failed to perform tail call elimination on a call "
9781 "site marked musttail");
9782
9783 // Get a count of how many bytes are to be pushed on the stack.
9784 unsigned NumBytes = CCInfo.getStackSize();
9785
9786 if (IsSibCall) {
9787 // Since we're not changing the ABI to make this a tail call, the memory
9788 // operands are already available in the caller's incoming argument space.
9789 NumBytes = 0;
9790 }
9791
9792 // FPDiff is the byte offset of the call's argument area from the callee's.
9793 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9794 // by this amount for a tail call. In a sibling call it must be 0 because the
9795 // caller will deallocate the entire stack and the callee still expects its
9796 // arguments to begin at SP+0. Completely unused for non-tail calls.
9797 int FPDiff = 0;
9798
9799 if (IsTailCall && !IsSibCall) {
9800 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9801
9802 // Since the callee will pop the argument stack as a tail call, we must
9803 // keep the popped size 16-byte aligned.
9804 NumBytes = alignTo(NumBytes, 16);
9805
9806 // FPDiff will be negative if this tail call requires more space than we
9807 // would automatically have in our incoming argument space. Positive if we
9808 // can actually shrink the stack.
9809 FPDiff = NumReusableBytes - NumBytes;
9810
9811 // Update the required reserved area if this is the tail call requiring the
9812 // most argument stack space.
9813 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9814 FuncInfo->setTailCallReservedStack(-FPDiff);
9815
9816 // The stack pointer must be 16-byte aligned at all times it's used for a
9817 // memory operation, which in practice means at *all* times and in
9818 // particular across call boundaries. Therefore our own arguments started at
9819 // a 16-byte aligned SP and the delta applied for the tail call should
9820 // satisfy the same constraint.
9821 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9822 }
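 // Worked example (illustrative numbers): if the caller's incoming argument
 // area is 32 bytes (NumReusableBytes) and the callee needs 48 bytes after
 // 16-byte alignment (NumBytes), then FPDiff = 32 - 48 = -16 and 16 extra
 // bytes are reserved via setTailCallReservedStack; a smaller callee area
 // gives a positive FPDiff and the existing slots are simply reused.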
9823
9824 auto DescribeCallsite =
9825 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9826 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9827 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9828 R << ore::NV("Callee", ES->getSymbol());
9829 else if (CLI.CB && CLI.CB->getCalledFunction())
9830 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9831 else
9832 R << "unknown callee";
9833 R << "'";
9834 return R;
9835 };
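 // The emitted remark then reads roughly:
 //   "call from 'foo' to 'bar' sets up a lazy save for ZA"
 // or "... requires a streaming mode transition", depending on which of the
 // checks below fires.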
9836
9837 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9838 bool RequiresSaveAllZA =
9839 !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
9840 if (RequiresLazySave) {
9841 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9842 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9843 TPIDR2.FrameIndex,
9844 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9845 Chain = DAG.getNode(
9846 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9847 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9848 TPIDR2ObjAddr);
9849 OptimizationRemarkEmitter ORE(&MF.getFunction());
9850 ORE.emit([&]() {
9851 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9852 CLI.CB)
9853 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9854 &MF.getFunction());
9855 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9856 });
9857 } else if (RequiresSaveAllZA) {
9858 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9859 "Cannot share state that may not exist");
9860 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9861 /*IsSave=*/true);
9862 }
9863
9864 bool RequiresSMChange = CallAttrs.requiresSMChange();
9865 if (RequiresSMChange) {
9866 OptimizationRemarkEmitter ORE(&MF.getFunction());
9867 ORE.emit([&]() {
9868 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9869 CLI.CB)
9870 : OptimizationRemarkAnalysis("sme", "SMETransition",
9871 &MF.getFunction());
9872 DescribeCallsite(R) << " requires a streaming mode transition";
9873 return R;
9874 });
9875 }
9876
9877 SDValue ZTFrameIdx;
9878 MachineFrameInfo &MFI = MF.getFrameInfo();
9879 bool ShouldPreserveZT0 =
9880 !UseNewSMEABILowering && CallAttrs.requiresPreservingZT0();
9881
9882 // If the caller has ZT0 state which will not be preserved by the callee,
9883 // spill ZT0 before the call.
9884 if (ShouldPreserveZT0) {
9885 ZTFrameIdx = getZT0FrameIndex(MFI, *FuncInfo, DAG);
9886
9887 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9888 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9889 }
9890
9891 // If the caller shares ZT0 but the callee does not share ZA, we need to
9892 // stop PSTATE.ZA before the call if there is no lazy-save active.
9893 bool DisableZA =
9894 !UseNewSMEABILowering && CallAttrs.requiresDisablingZABeforeCall();
9895 assert((!DisableZA || !RequiresLazySave) &&
9896 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9897
9898 if (DisableZA)
9899 Chain = DAG.getNode(
9900 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9901 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9902
9903 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9904 // These operations are automatically eliminated by the prolog/epilog pass
9905 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9906 if (!IsSibCall) {
9907 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9908 if (ZAMarkerNode) {
9909 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to; simply
9910 // using a chain can result in incorrect scheduling. The markers refer to
9911 // the position just before the CALLSEQ_START (though occur after as
9912 // CALLSEQ_START lacks in-glue).
9913 Chain =
9914 DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other, MVT::Glue),
9915 {Chain, Chain.getValue(1)});
9916 }
9917 }
9918
9919 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9920 getPointerTy(DAG.getDataLayout()));
9921
9922 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9923 SmallSet<unsigned, 8> RegsUsed;
9924 SmallVector<SDValue, 8> MemOpChains;
9925 auto PtrVT = getPointerTy(DAG.getDataLayout());
9926
9927 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9928 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9929 for (const auto &F : Forwards) {
9930 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9931 RegsToPass.emplace_back(F.PReg, Val);
9932 }
9933 }
9934
9935 // Walk the register/memloc assignments, inserting copies/loads.
9936 unsigned ExtraArgLocs = 0;
9937 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9938 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9939 SDValue Arg = OutVals[i];
9940 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9941
9942 // Promote the value if needed.
9943 switch (VA.getLocInfo()) {
9944 default:
9945 llvm_unreachable("Unknown loc info!");
9946 case CCValAssign::Full:
9947 break;
9948 case CCValAssign::SExt:
9949 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9950 break;
9951 case CCValAssign::ZExt:
9952 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9953 break;
9954 case CCValAssign::AExt:
9955 if (Outs[i].ArgVT == MVT::i1) {
9956 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9957 //
9958 // Check if we actually have to do this, because the value may
9959 // already be zero-extended.
9960 //
9961 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9962 // and rely on DAGCombiner to fold this, because the following
9963 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9964 //
9965 // (ext (zext x)) -> (zext x)
9966 //
9967 // This will give us (zext i32), which we cannot remove, so
9968 // try to check this beforehand.
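 // When the zero-extension is still required, the sequence emitted below is
 // effectively (any_extend (zero_extend i8 (truncate i1 Arg))).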
9969 if (!checkZExtBool(Arg, DAG)) {
9970 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9971 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9972 }
9973 }
9974 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9975 break;
9976 case CCValAssign::AExtUpper:
9977 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9978 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9979 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9980 DAG.getConstant(32, DL, VA.getLocVT()));
9981 break;
9982 case CCValAssign::BCvt:
9983 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9984 break;
9985 case CCValAssign::Trunc:
9986 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9987 break;
9988 case CCValAssign::FPExt:
9989 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9990 break;
9991 case CCValAssign::Indirect: {
9992 bool isScalable = VA.getValVT().isScalableVT();
9993 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9994 "Indirect arguments should be scalable on most subtargets");
9995
9996 TypeSize StoreSize = VA.getValVT().getStoreSize();
9997 TypeSize PartSize = StoreSize;
9998 unsigned NumParts = 1;
9999 if (Outs[i].Flags.isInConsecutiveRegs()) {
10000 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
10001 ++NumParts;
10002 StoreSize *= NumParts;
10003 }
10004
10005 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
10006 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
10007 MachineFrameInfo &MFI = MF.getFrameInfo();
10008 int FI =
10009 MFI.CreateStackObject(StoreSize.getKnownMinValue(), Alignment, false);
10010 if (isScalable) {
10011 bool IsPred = VA.getValVT() == MVT::aarch64svcount ||
10012 VA.getValVT().getVectorElementType() == MVT::i1;
10013 MFI.setStackID(FI, IsPred ? TargetStackID::ScalablePredicateVector
10014 : TargetStackID::ScalableVector);
10015 }
10016
10017 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
10018 SDValue Ptr = DAG.getFrameIndex(
10019 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
10020 SDValue SpillSlot = Ptr;
10021
10022 // Ensure we generate all stores for each tuple part, whilst updating the
10023 // pointer after each store correctly using vscale.
10024 while (NumParts) {
10025 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
10026 MemOpChains.push_back(Store);
10027
10028 NumParts--;
10029 if (NumParts > 0) {
10030 SDValue BytesIncrement =
10031 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
10032 MPI = MachinePointerInfo(MPI.getAddrSpace());
10033 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
10034 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
10035 ExtraArgLocs++;
10036 i++;
10037 }
10038 }
10039
10040 Arg = SpillSlot;
10041 break;
10042 }
10043
10044 if (VA.isRegLoc()) {
10045 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
10046 Outs[0].VT == MVT::i64) {
10047 assert(VA.getLocVT() == MVT::i64 &&
10048 "unexpected calling convention register assignment");
10049 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
10050 "unexpected use of 'returned'");
10051 IsThisReturn = true;
10052 }
10053 if (RegsUsed.count(VA.getLocReg())) {
10054 // If this register has already been used then we're trying to pack
10055 // parts of an [N x i32] into an X-register. The extension type will
10056 // take care of putting the two halves in the right place but we have to
10057 // combine them.
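 // For example, a [2 x i32] argument packed into one X register is assigned
 // here twice; the second half was shifted into bits 63:32 (AExtUpper above),
 // so OR-ing the parts reassembles the full register value.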
10058 SDValue &Bits =
10059 llvm::find_if(RegsToPass,
10060 [=](const std::pair<unsigned, SDValue> &Elt) {
10061 return Elt.first == VA.getLocReg();
10062 })
10063 ->second;
10064 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10065 // Call site info is used for function's parameter entry value
10066 // tracking. For now we track only simple cases when the parameter
10067 // is transferred through a whole register.
10068 llvm::erase_if(CSInfo.ArgRegPairs,
10069 [&VA](MachineFunction::ArgRegPair ArgReg) {
10070 return ArgReg.Reg == VA.getLocReg();
10071 });
10072 } else {
10073 // Add an extra level of indirection for streaming mode changes by
10074 // using a pseudo copy node that cannot be rematerialised between a
10075 // smstart/smstop and the call by the simple register coalescer.
10076 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
10077 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10078 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
10079 RegsToPass.emplace_back(VA.getLocReg(), Arg);
10080 RegsUsed.insert(VA.getLocReg());
10081 const TargetOptions &Options = DAG.getTarget().Options;
10082 if (Options.EmitCallSiteInfo)
10083 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
10084 }
10085 } else {
10086 assert(VA.isMemLoc());
10087
10088 SDValue DstAddr;
10089 MachinePointerInfo DstInfo;
10090
10091 // FIXME: This works on big-endian for composite byvals, which are the
10092 // common case. It should also work for fundamental types too.
10093 uint32_t BEAlign = 0;
10094 unsigned OpSize;
10095 if (VA.getLocInfo() == CCValAssign::Indirect ||
10096 VA.getLocInfo() == CCValAssign::Trunc)
10097 OpSize = VA.getLocVT().getFixedSizeInBits();
10098 else
10099 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
10100 : VA.getValVT().getSizeInBits();
10101 OpSize = (OpSize + 7) / 8;
10102 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
10103 !Flags.isInConsecutiveRegs()) {
10104 if (OpSize < 8)
10105 BEAlign = 8 - OpSize;
10106 }
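 // e.g. OpSize == 4 gives BEAlign == 4, so the four meaningful bytes are
 // stored in the upper half of the 8-byte stack slot on big-endian targets.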
10107 unsigned LocMemOffset = VA.getLocMemOffset();
10108 int32_t Offset = LocMemOffset + BEAlign;
10109
10110 if (IsTailCall) {
10111 // When the frame pointer is perfectly aligned for the tail call and the
10112 // same stack argument is passed down intact, we can reuse it.
10113 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
10114 continue;
10115
10116 Offset = Offset + FPDiff;
10117 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
10118
10119 DstAddr = DAG.getFrameIndex(FI, PtrVT);
10120 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
10121
10122 // Make sure any stack arguments overlapping with where we're storing
10123 // are loaded before this eventual operation. Otherwise they'll be
10124 // clobbered.
10125 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
10126 } else {
10127 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
10128
10129 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
10130 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
10131 }
10132
10133 if (Outs[i].Flags.isByVal()) {
10134 SDValue SizeNode =
10135 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
10136 SDValue Cpy = DAG.getMemcpy(
10137 Chain, DL, DstAddr, Arg, SizeNode,
10138 Outs[i].Flags.getNonZeroByValAlign(),
10139 /*isVol = */ false, /*AlwaysInline = */ false,
10140 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
10141
10142 MemOpChains.push_back(Cpy);
10143 } else {
10144 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
10145 // promoted to a legal register type i32, we should truncate Arg back to
10146 // i1/i8/i16.
10147 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
10148 VA.getValVT() == MVT::i16)
10149 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
10150
10151 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
10152 MemOpChains.push_back(Store);
10153 }
10154 }
10155 }
10156
10157 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
10158 !(CLI.CB && CLI.CB->isMustTailCall())) {
10159 SDValue ParamPtr = StackPtr;
10160 if (IsTailCall) {
10161 // Create a dummy object at the top of the stack that can be used to get
10162 // the SP after the epilogue
10163 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
10164 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
10165 }
10166
10167 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
10168 // describing the argument list. x4 contains the address of the
10169 // first stack parameter. x5 contains the size in bytes of all parameters
10170 // passed on the stack.
10171 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
10172 RegsToPass.emplace_back(AArch64::X5,
10173 DAG.getConstant(NumBytes, DL, MVT::i64));
10174 }
10175
10176 if (!MemOpChains.empty())
10177 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
10178
10179 SDValue InGlue;
10180 if (RequiresSMChange) {
10181 bool InsertVectorLengthCheck =
10183 Chain = changeStreamingMode(
10184 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
10185 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
10186 InGlue = Chain.getValue(1);
10187 }
10188
10189 // Build a sequence of copy-to-reg nodes chained together with token chain
10190 // and flag operands which copy the outgoing args into the appropriate regs.
10191 for (auto &RegToPass : RegsToPass) {
10192 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
10193 RegToPass.second, InGlue);
10194 InGlue = Chain.getValue(1);
10195 }
10196
10197 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
10198 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
10199 // node so that legalize doesn't hack it.
10200 const GlobalValue *CalledGlobal = nullptr;
10201 unsigned OpFlags = 0;
10202 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
10203 CalledGlobal = G->getGlobal();
10204 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
10205 getTargetMachine());
10206 if (OpFlags & AArch64II::MO_GOT) {
10207 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
10208 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10209 } else {
10210 const GlobalValue *GV = G->getGlobal();
10211 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
10212 }
10213 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
10214 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
10215 Subtarget->isTargetMachO()) ||
10216 MF.getFunction().getParent()->getRtLibUseGOT();
10217 const char *Sym = S->getSymbol();
10218 if (UseGot) {
10219 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
10220 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10221 } else {
10222 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
10223 }
10224 }
10225
10226 // We don't usually want to end the call-sequence here because we would tidy
10227 // the frame up *after* the call, however in the ABI-changing tail-call case
10228 // we've carefully laid out the parameters so that when sp is reset they'll be
10229 // in the correct location.
10230 if (IsTailCall && !IsSibCall) {
10231 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
10232 InGlue = Chain.getValue(1);
10233 }
10234
10235 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
10236
10237 std::vector<SDValue> Ops;
10238 Ops.push_back(Chain);
10239 Ops.push_back(Callee);
10240
10241 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
10242 // be expanded to the call, directly followed by a special marker sequence and
10243 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
10244 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
10245 assert(!IsTailCall &&
10246 "tail calls cannot be marked with clang.arc.attachedcall");
10247 Opc = AArch64ISD::CALL_RVMARKER;
10248
10249 // Add a target global address for the retainRV/claimRV runtime function
10250 // just before the call target.
10251 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
10252 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
10253 Ops.insert(Ops.begin() + 1, GA);
10254
10255 // We may or may not need to emit both the marker and the retain/claim call.
10256 // Tell the pseudo expansion using an additional boolean op.
10257 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
10258 SDValue DoEmitMarker =
10259 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
10260 Ops.insert(Ops.begin() + 2, DoEmitMarker);
10261 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10262 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
10263 } else if (GuardWithBTI) {
10264 Opc = AArch64ISD::CALL_BTI;
10265 }
10266
10267 if (IsTailCall) {
10268 // Each tail call may have to adjust the stack by a different amount, so
10269 // this information must travel along with the operation for eventual
10270 // consumption by emitEpilogue.
10271 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
10272 }
10273
10274 if (CLI.PAI) {
10275 const uint64_t Key = CLI.PAI->Key;
10277 "Invalid auth call key");
10278
10279 // Split the discriminator into address/integer components.
10280 SDValue AddrDisc, IntDisc;
10281 std::tie(IntDisc, AddrDisc) =
10282 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
10283
10284 if (Opc == AArch64ISD::CALL_RVMARKER)
10285 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
10286 else
10287 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
10288 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
10289 Ops.push_back(IntDisc);
10290 Ops.push_back(AddrDisc);
10291 }
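 // For example, a discriminator built with ptrauth.blend(%addr, 1234) is
 // split here so that IntDisc is the constant 1234 and AddrDisc is %addr.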
10292
10293 // Add argument registers to the end of the list so that they are known live
10294 // into the call.
10295 for (auto &RegToPass : RegsToPass)
10296 Ops.push_back(DAG.getRegister(RegToPass.first,
10297 RegToPass.second.getValueType()));
10298
10299 // Add a register mask operand representing the call-preserved registers.
10300 const uint32_t *Mask;
10301 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10302 if (IsThisReturn) {
10303 // For 'this' returns, use the X0-preserving mask if applicable
10304 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
10305 if (!Mask) {
10306 IsThisReturn = false;
10307 Mask = TRI->getCallPreservedMask(MF, CallConv);
10308 }
10309 } else
10310 Mask = TRI->getCallPreservedMask(MF, CallConv);
10311
10312 if (Subtarget->hasCustomCallingConv())
10313 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
10314
10315 if (TRI->isAnyArgRegReserved(MF))
10316 TRI->emitReservedArgRegCallError(MF);
10317
10318 assert(Mask && "Missing call preserved mask for calling convention");
10319 Ops.push_back(DAG.getRegisterMask(Mask));
10320
10321 if (InGlue.getNode())
10322 Ops.push_back(InGlue);
10323
10324 if (CLI.DeactivationSymbol)
10325 Ops.push_back(DAG.getDeactivationSymbol(CLI.DeactivationSymbol));
10326
10327 // If we're doing a tail call, use a TC_RETURN here rather than an
10328 // actual call instruction.
10329 if (IsTailCall) {
10330 MF.getFrameInfo().setHasTailCall();
10331 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
10332 if (IsCFICall)
10333 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10334
10335 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
10336 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
10337 if (CalledGlobal &&
10338 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10339 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
10340 return Ret;
10341 }
10342
10343 // Returns a chain and a flag for retval copy to use.
10344 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
10345 if (IsCFICall)
10346 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10347
10348 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
10349 InGlue = Chain.getValue(1);
10350 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
10351 if (CalledGlobal &&
10352 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10353 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
10354
10355 uint64_t CalleePopBytes =
10356 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
10357
10358 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
10359 InGlue = Chain.getValue(1);
10360
10361 // Handle result values, copying them out of physregs into vregs that we
10362 // return.
10363 SDValue Result = LowerCallResult(
10364 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
10365 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
10366
10367 if (!Ins.empty())
10368 InGlue = Result.getValue(Result->getNumValues() - 1);
10369
10370 if (RequiresSMChange) {
10371 Result = changeStreamingMode(
10372 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10373 getSMToggleCondition(CallAttrs));
10374 }
10375
10376 if (!UseNewSMEABILowering &&
10377 (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()))
10378 // Unconditionally resume ZA.
10379 Result = DAG.getNode(
10380 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
10381 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
10382
10383 if (ShouldPreserveZT0)
10384 Result =
10385 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
10386 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
10387
10388 if (RequiresLazySave) {
10389 Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG);
10390 } else if (RequiresSaveAllZA) {
10391 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
10392 /*IsSave=*/false);
10393 }
10394
10395 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
10396 RequiresSaveAllZA) {
10397 for (unsigned I = 0; I < InVals.size(); ++I) {
10398 // The smstart/smstop is chained as part of the call, but when the
10399 // resulting chain is discarded (which happens when the call is not part
10400 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10401 // smstart/smstop is chained to the result value. We can do that by doing
10402 // a vreg -> vreg copy.
10403 Register Reg = MF.getRegInfo().createVirtualRegister(
10404 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
10405 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10406 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10407 InVals[I].getValueType());
10408 }
10409 }
10410
10411 if (CallConv == CallingConv::PreserveNone) {
10412 for (const ISD::OutputArg &O : Outs) {
10413 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10414 O.Flags.isSwiftAsync()) {
10415 MachineFunction &MF = DAG.getMachineFunction();
10416 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10417 MF.getFunction(),
10418 "Swift attributes can't be used with preserve_none",
10419 DL.getDebugLoc()));
10420 break;
10421 }
10422 }
10423 }
10424
10425 return Result;
10426}
10427
10428bool AArch64TargetLowering::CanLowerReturn(
10429 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
10430 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
10431 const Type *RetTy) const {
10432 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10433 SmallVector<CCValAssign, 16> RVLocs;
10434 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
10435 return CCInfo.CheckReturn(Outs, RetCC);
10436}
10437
10438SDValue
10439AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
10440 bool isVarArg,
10441 const SmallVectorImpl<ISD::OutputArg> &Outs,
10442 const SmallVectorImpl<SDValue> &OutVals,
10443 const SDLoc &DL, SelectionDAG &DAG) const {
10444 auto &MF = DAG.getMachineFunction();
10445 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10446
10447 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10448 SmallVector<CCValAssign, 16> RVLocs;
10449 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
10450 CCInfo.AnalyzeReturn(Outs, RetCC);
10451
10452 // Copy the result values into the output registers.
10453 SDValue Glue;
10454 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
10455 SmallSet<unsigned, 4> RegsUsed;
10456 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
10457 ++i, ++realRVLocIdx) {
10458 CCValAssign &VA = RVLocs[i];
10459 assert(VA.isRegLoc() && "Can only return in registers!");
10460 SDValue Arg = OutVals[realRVLocIdx];
10461
10462 switch (VA.getLocInfo()) {
10463 default:
10464 llvm_unreachable("Unknown loc info!");
10465 case CCValAssign::Full:
10466 if (Outs[i].ArgVT == MVT::i1) {
10467 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
10468 // value. This is strictly redundant on Darwin (which uses "zeroext
10469 // i1"), but will be optimised out before ISel.
10470 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10471 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10472 }
10473 break;
10474 case CCValAssign::BCvt:
10475 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
10476 break;
10477 case CCValAssign::AExt:
10478 case CCValAssign::ZExt:
10479 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10480 break;
10481 case CCValAssign::AExtUpper:
10482 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10483 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10484 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10485 DAG.getConstant(32, DL, VA.getLocVT()));
10486 break;
10487 }
10488
10489 if (RegsUsed.count(VA.getLocReg())) {
10490 SDValue &Bits =
10491 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
10492 return Elt.first == VA.getLocReg();
10493 })->second;
10494 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10495 } else {
10496 RetVals.emplace_back(VA.getLocReg(), Arg);
10497 RegsUsed.insert(VA.getLocReg());
10498 }
10499 }
10500
10501 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10502
10503 // Emit SMSTOP before returning from a locally streaming function
10504 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
10505 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
10506 if (FuncAttrs.hasStreamingCompatibleInterface())
10507 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10508 /*Glue*/ SDValue(),
10509 AArch64SME::IfCallerIsNonStreaming);
10510 else
10511 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10512 /*Glue*/ SDValue(), AArch64SME::Always);
10513 Glue = Chain.getValue(1);
10514 }
10515
10516 SmallVector<SDValue, 4> RetOps(1, Chain);
10517 for (auto &RetVal : RetVals) {
10518 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
10519 isPassedInFPR(RetVal.second.getValueType()))
10520 RetVal.second =
10521 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10522 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
10523 RetVal.second);
10524 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
10525 Glue = Chain.getValue(1);
10526 RetOps.push_back(
10527 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
10528 }
10529
10530 // Windows AArch64 ABIs require that for returning structs by value we copy
10531 // the sret argument into X0 for the return.
10532 // We saved the argument into a virtual register in the entry block,
10533 // so now we copy the value out and into X0.
10534 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
10535 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
10536 getPointerTy(MF.getDataLayout()));
10537
10538 unsigned RetValReg = AArch64::X0;
10539 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
10540 RetValReg = AArch64::X8;
10541 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
10542 Glue = Chain.getValue(1);
10543
10544 RetOps.push_back(
10545 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
10546 }
10547
10548 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
10549 if (I) {
10550 for (; *I; ++I) {
10551 if (AArch64::GPR64RegClass.contains(*I))
10552 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
10553 else if (AArch64::FPR64RegClass.contains(*I))
10554 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
10555 else
10556 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10557 }
10558 }
10559
10560 RetOps[0] = Chain; // Update chain.
10561
10562 // Add the glue if we have it.
10563 if (Glue.getNode())
10564 RetOps.push_back(Glue);
10565
10566 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10567 // ARM64EC entry thunks use a special return sequence: instead of a regular
10568 // "ret" instruction, they need to explicitly call the emulator.
10569 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10570 SDValue Arm64ECRetDest =
10571 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
10572 Arm64ECRetDest =
10573 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
10574 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
10575 MachinePointerInfo());
10576 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
10577 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
10578 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
10579 }
10580
10581 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
10582}
10583
10584//===----------------------------------------------------------------------===//
10585// Other Lowering Code
10586//===----------------------------------------------------------------------===//
10587
10588SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10589 SelectionDAG &DAG,
10590 unsigned Flag) const {
10591 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10592 N->getOffset(), Flag);
10593}
10594
10595SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10596 SelectionDAG &DAG,
10597 unsigned Flag) const {
10598 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10599}
10600
10601SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10602 SelectionDAG &DAG,
10603 unsigned Flag) const {
10604 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10605 N->getOffset(), Flag);
10606}
10607
10608SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
10609 SelectionDAG &DAG,
10610 unsigned Flag) const {
10611 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10612}
10613
10614SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10615 SelectionDAG &DAG,
10616 unsigned Flag) const {
10617 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10618}
10619
10620// (loadGOT sym)
10621template <class NodeTy>
10622SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10623 unsigned Flags) const {
10624 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10625 SDLoc DL(N);
10626 EVT Ty = getPointerTy(DAG.getDataLayout());
10627 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10628 // FIXME: Once remat is capable of dealing with instructions with register
10629 // operands, expand this into two nodes instead of using a wrapper node.
10630 if (DAG.getMachineFunction()
10631 .getInfo<AArch64FunctionInfo>()
10632 ->hasELFSignedGOT())
10633 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10634 0);
10635 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10636}
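// On ELF the LOADgot node above typically materializes as
//   adrp xN, :got:sym
//   ldr  xN, [xN, :got_lo12:sym]
// (MachO uses the equivalent sym@GOTPAGE / sym@GOTPAGEOFF forms, and signed
// GOTs go through the LOADgotAUTH pseudo selected above). Illustrative only.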
10637
10638// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10639template <class NodeTy>
10640SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10641 unsigned Flags) const {
10642 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10643 SDLoc DL(N);
10644 EVT Ty = getPointerTy(DAG.getDataLayout());
10645 const unsigned char MO_NC = AArch64II::MO_NC;
10646 return DAG.getNode(
10647 AArch64ISD::WrapperLarge, DL, Ty,
10648 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10649 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10650 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10651 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10652}
10653
10654// (addlow (adrp %hi(sym)) %lo(sym))
10655template <class NodeTy>
10656SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10657 unsigned Flags) const {
10658 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10659 SDLoc DL(N);
10660 EVT Ty = getPointerTy(DAG.getDataLayout());
10661 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10662 SDValue Lo = getTargetNode(N, Ty, DAG,
10663 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
10664 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10665 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10666}
10667
10668// (adr sym)
10669template <class NodeTy>
10670SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10671 unsigned Flags) const {
10672 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10673 SDLoc DL(N);
10674 EVT Ty = getPointerTy(DAG.getDataLayout());
10675 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10676 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10677}
10678
10679SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10680 SelectionDAG &DAG) const {
10681 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10682 const GlobalValue *GV = GN->getGlobal();
10683 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10684
10685 if (OpFlags != AArch64II::MO_NO_FLAG)
10687 "unexpected offset in global node");
10688
10689 // This also catches the large code model case for Darwin, and tiny code
10690 // model with got relocations.
10691 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10692 return getGOT(GN, DAG, OpFlags);
10693 }
10694
10694
10695 SDValue Result;
10696 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10697 !getTargetMachine().isPositionIndependent()) {
10698 Result = getAddrLarge(GN, DAG, OpFlags);
10699 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10700 Result = getAddrTiny(GN, DAG, OpFlags);
10701 } else {
10702 Result = getAddr(GN, DAG, OpFlags);
10703 }
10704 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10705 SDLoc DL(GN);
10706 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
10707 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10708 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
10709 return Result;
10710}
10711
10712/// Convert a TLS address reference into the correct sequence of loads
10713/// and calls to compute the variable's address (for Darwin, currently) and
10714/// return an SDValue containing the final node.
10715
10716/// Darwin only has one TLS scheme which must be capable of dealing with the
10717/// fully general situation, in the worst case. This means:
10718/// + "extern __thread" declaration.
10719/// + Defined in a possibly unknown dynamic library.
10720///
10721/// The general system is that each __thread variable has a [3 x i64] descriptor
10722/// which contains information used by the runtime to calculate the address. The
10723/// only part of this the compiler needs to know about is the first xword, which
10724/// contains a function pointer that must be called with the address of the
10725/// entire descriptor in "x0".
10726///
10727/// Since this descriptor may be in a different unit, in general even the
10728/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10729/// is:
10730/// adrp x0, _var@TLVPPAGE
10731/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10732/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10733/// ; the function pointer
10734/// blr x1 ; Uses descriptor address in x0
10735/// ; Address of _var is now in x0.
10736///
10737/// If the address of _var's descriptor *is* known to the linker, then it can
10738/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10739/// a slight efficiency gain.
10740SDValue
10741AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10742 SelectionDAG &DAG) const {
10743 assert(Subtarget->isTargetDarwin() &&
10744 "This function expects a Darwin target");
10745
10746 SDLoc DL(Op);
10747 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10748 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10749 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10750
10751 SDValue TLVPAddr =
10752 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10753 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10754
10755 // The first entry in the descriptor is a function pointer that we must call
10756 // to obtain the address of the variable.
10757 SDValue Chain = DAG.getEntryNode();
10758 SDValue FuncTLVGet = DAG.getLoad(
10759 PtrMemVT, DL, Chain, DescAddr,
10760 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10761 Align(PtrMemVT.getSizeInBits() / 8),
10762 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10763 Chain = FuncTLVGet.getValue(1);
10764
10765 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10766 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10767
10768 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10769 MFI.setAdjustsStack(true);
10770
10771 // TLS calls preserve all registers except those that absolutely must be
10772 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10773 // silly).
10774 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10775 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10776 if (Subtarget->hasCustomCallingConv())
10777 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10778
10779 // Finally, we can make the call. This is just a degenerate version of a
10780 // normal AArch64 call node: x0 takes the address of the descriptor, and
10781 // returns the address of the variable in this thread.
10782 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10783
10784 unsigned Opcode = AArch64ISD::CALL;
10785 SmallVector<SDValue, 8> Ops;
10786 Ops.push_back(Chain);
10787 Ops.push_back(FuncTLVGet);
10788
10789 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10790 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10791 Opcode = AArch64ISD::AUTH_CALL;
10792 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10793 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10794 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10795 }
10796
10797 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10798 Ops.push_back(DAG.getRegisterMask(Mask));
10799 Ops.push_back(Chain.getValue(1));
10800 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10801 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10802}
10803
10804/// Convert a thread-local variable reference into a sequence of instructions to
10805/// compute the variable's address for the local exec TLS model of ELF targets.
10806/// The sequence depends on the maximum TLS area size.
10807SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10808 SDValue ThreadBase,
10809 const SDLoc &DL,
10810 SelectionDAG &DAG) const {
10811 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10812 SDValue TPOff, Addr;
10813
10814 switch (DAG.getTarget().Options.TLSSize) {
10815 default:
10816 llvm_unreachable("Unexpected TLS size");
10817
10818 case 12: {
10819 // mrs x0, TPIDR_EL0
10820 // add x0, x0, :tprel_lo12:a
10821 SDValue Var = DAG.getTargetGlobalAddress(
10822 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10823 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10824 Var,
10825 DAG.getTargetConstant(0, DL, MVT::i32)),
10826 0);
10827 }
10828
10829 case 24: {
10830 // mrs x0, TPIDR_EL0
10831 // add x0, x0, :tprel_hi12:a
10832 // add x0, x0, :tprel_lo12_nc:a
10833 SDValue HiVar = DAG.getTargetGlobalAddress(
10834 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10835 SDValue LoVar = DAG.getTargetGlobalAddress(
10836 GV, DL, PtrVT, 0,
10837 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10838 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10839 HiVar,
10840 DAG.getTargetConstant(0, DL, MVT::i32)),
10841 0);
10842 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10843 LoVar,
10844 DAG.getTargetConstant(0, DL, MVT::i32)),
10845 0);
10846 }
10847
10848 case 32: {
10849 // mrs x1, TPIDR_EL0
10850 // movz x0, #:tprel_g1:a
10851 // movk x0, #:tprel_g0_nc:a
10852 // add x0, x1, x0
10853 SDValue HiVar = DAG.getTargetGlobalAddress(
10854 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10855 SDValue LoVar = DAG.getTargetGlobalAddress(
10856 GV, DL, PtrVT, 0,
10857 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10858 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10859 DAG.getTargetConstant(16, DL, MVT::i32)),
10860 0);
10861 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10862 DAG.getTargetConstant(0, DL, MVT::i32)),
10863 0);
10864 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10865 }
10866
10867 case 48: {
10868 // mrs x1, TPIDR_EL0
10869 // movz x0, #:tprel_g2:a
10870 // movk x0, #:tprel_g1_nc:a
10871 // movk x0, #:tprel_g0_nc:a
10872 // add x0, x1, x0
10873 SDValue HiVar = DAG.getTargetGlobalAddress(
10874 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10875 SDValue MiVar = DAG.getTargetGlobalAddress(
10876 GV, DL, PtrVT, 0,
10877 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10878 SDValue LoVar = DAG.getTargetGlobalAddress(
10879 GV, DL, PtrVT, 0,
10880 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10881 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10882 DAG.getTargetConstant(32, DL, MVT::i32)),
10883 0);
10884 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10885 DAG.getTargetConstant(16, DL, MVT::i32)),
10886 0);
10887 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10888 DAG.getTargetConstant(0, DL, MVT::i32)),
10889 0);
10890 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10891 }
10892 }
10893}
10894
10895/// When accessing thread-local variables under either the general-dynamic or
10896/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10897/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10898/// is a function pointer to carry out the resolution.
10899///
10900/// The sequence is:
10901/// adrp x0, :tlsdesc:var
10902/// ldr x1, [x0, #:tlsdesc_lo12:var]
10903/// add x0, x0, #:tlsdesc_lo12:var
10904/// .tlsdesccall var
10905/// blr x1
10906/// (TPIDR_EL0 offset now in x0)
10907///
10908/// The above sequence must be produced unscheduled, to enable the linker to
10909/// optimize/relax this sequence.
10910/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10911/// above sequence, and expanded really late in the compilation flow, to ensure
10912/// the sequence is produced as per above.
10913SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10914 const SDLoc &DL,
10915 SelectionDAG &DAG) const {
10916 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10917 auto &MF = DAG.getMachineFunction();
10918 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10919
10920 SDValue Glue;
10921 SDValue Chain = DAG.getEntryNode();
10922 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10923
10924 SMECallAttrs TLSCallAttrs(FuncInfo->getSMEFnAttrs(), {}, SMEAttrs::Normal);
10925 bool RequiresSMChange = TLSCallAttrs.requiresSMChange();
10926
10927 auto ChainAndGlue = [](SDValue Chain) -> std::pair<SDValue, SDValue> {
10928 return {Chain, Chain.getValue(1)};
10929 };
10930
10931 if (RequiresSMChange)
10932 std::tie(Chain, Glue) =
10933 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/false, Chain, Glue,
10934 getSMToggleCondition(TLSCallAttrs)));
10935
10936 unsigned Opcode =
10937 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10938 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10939 : AArch64ISD::TLSDESC_CALLSEQ;
10940 SDValue Ops[] = {Chain, SymAddr, Glue};
10941 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
10942 Opcode, DL, NodeTys, Glue ? ArrayRef(Ops) : ArrayRef(Ops).drop_back()));
10943
10944 if (TLSCallAttrs.requiresLazySave())
10945 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
10946 AArch64ISD::REQUIRES_ZA_SAVE, DL, NodeTys, {Chain, Chain.getValue(1)}));
10947
10948 if (RequiresSMChange)
10949 std::tie(Chain, Glue) =
10950 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
10951 getSMToggleCondition(TLSCallAttrs)));
10952
10953 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10954}
10955
10956SDValue
10957AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10958 SelectionDAG &DAG) const {
10959 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10960
10961 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10962 AArch64FunctionInfo *MFI =
10963 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10964
10965 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
10966
10967 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
10968
10970 if (Model == TLSModel::LocalDynamic)
10971 Model = TLSModel::GeneralDynamic;
10972 }
10973
10974 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10975 Model != TLSModel::LocalExec)
10976 report_fatal_error("ELF TLS only supported in small memory model or "
10977 "in local exec TLS model");
10978 // Different choices can be made for the maximum size of the TLS area for a
10979 // module. For the small address model, the default TLS size is 16MiB and the
10980 // maximum TLS size is 4GiB.
10981 // FIXME: add tiny and large code model support for TLS access models other
10982 // than local exec. We currently generate the same code as small for tiny,
10983 // which may be larger than needed.
10984
10985 SDValue TPOff;
10986 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10987 SDLoc DL(Op);
10988 const GlobalValue *GV = GA->getGlobal();
10989
10990 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10991
10992 if (Model == TLSModel::LocalExec) {
10993 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10994 } else if (Model == TLSModel::InitialExec) {
10995 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10996 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10997 } else if (Model == TLSModel::LocalDynamic) {
10998 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10999 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
11000 // the beginning of the module's TLS region, followed by a DTPREL offset
11001 // calculation.
11002
11003 // These accesses will need deduplicating if there's more than one.
11004 MFI->incNumLocalDynamicTLSAccesses();
11005
11006 // The call needs a relocation too for linker relaxation. It doesn't make
11007 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
11008 // the address.
11009 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
11010 AArch64II::MO_TLS);
11011
11012 // Now we can calculate the offset from TPIDR_EL0 to this module's
11013 // thread-local area.
11014 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
11015
11016 // Now use :dtprel_whatever: operations to calculate this variable's offset
11017 // in its thread-storage area.
11018 SDValue HiVar = DAG.getTargetGlobalAddress(
11019 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
11020 SDValue LoVar = DAG.getTargetGlobalAddress(
11021 GV, DL, MVT::i64, 0,
11022 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
11023
11024 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
11025 DAG.getTargetConstant(0, DL, MVT::i32)),
11026 0);
11027 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
11028 DAG.getTargetConstant(0, DL, MVT::i32)),
11029 0);
11030 } else if (Model == TLSModel::GeneralDynamic) {
11031 // The call needs a relocation too for linker relaxation. It doesn't make
11032 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
11033 // the address.
11034 SDValue SymAddr =
11035 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
11036
11037 // Finally we can make a call to calculate the offset from tpidr_el0.
11038 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
11039 } else
11040 llvm_unreachable("Unsupported ELF TLS access model");
11041
11042 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
11043}
11044
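/// Lower Windows TLS accesses. The generated code roughly follows the
/// sequence below (illustrative sketch; the relocations are carried by the
/// MO_TLS target flags used in the lowering):
///   ldr  x8, [x18, #0x58]            ; TLS array pointer from the TEB
///   adrp x9, _tls_index
///   ldr  w9, [x9, :lo12:_tls_index]  ; this module's TLS index
///   ldr  x8, [x8, x9, lsl #3]        ; this module's TLS block
///   add  x0, x8, :secrel_hi12:var
///   add  x0, x0, :secrel_lo12:var    ; address of 'var'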
11045SDValue
11046AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
11047 SelectionDAG &DAG) const {
11048 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
11049
11050 SDValue Chain = DAG.getEntryNode();
11051 EVT PtrVT = getPointerTy(DAG.getDataLayout());
11052 SDLoc DL(Op);
11053
11054 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
11055
11056 // Load the ThreadLocalStoragePointer from the TEB
11057 // A pointer to the TLS array is located at offset 0x58 from the TEB.
11058 SDValue TLSArray =
11059 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
11060 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
11061 Chain = TLSArray.getValue(1);
11062
11063 // Load the TLS index from the C runtime;
11064 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
11065 // This also does the same as LOADgot, but using a generic i32 load,
11066 // while LOADgot only loads i64.
11067 SDValue TLSIndexHi =
11068 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
11069 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
11070 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
11071 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
11072 SDValue TLSIndex =
11073 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
11074 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
11075 Chain = TLSIndex.getValue(1);
11076
11077 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
11078 // offset into the TLSArray.
11079 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
11080 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
11081 DAG.getConstant(3, DL, PtrVT));
11082 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
11083 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
11084 MachinePointerInfo());
11085 Chain = TLS.getValue(1);
11086
11087 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11088 const GlobalValue *GV = GA->getGlobal();
11089 SDValue TGAHi = DAG.getTargetGlobalAddress(
11090 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
11091 SDValue TGALo = DAG.getTargetGlobalAddress(
11092 GV, DL, PtrVT, 0,
11093 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
11094
11095 // Add the offset from the start of the .tls section (section base).
11096 SDValue Addr =
11097 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
11098 DAG.getTargetConstant(0, DL, MVT::i32)),
11099 0);
11100 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
11101 return Addr;
11102}
11103
11104SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
11105 SelectionDAG &DAG) const {
11106 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11107 if (DAG.getTarget().useEmulatedTLS())
11108 return LowerToTLSEmulatedModel(GA, DAG);
11109
11110 if (Subtarget->isTargetDarwin())
11111 return LowerDarwinGlobalTLSAddress(Op, DAG);
11112 if (Subtarget->isTargetELF())
11113 return LowerELFGlobalTLSAddress(Op, DAG);
11114 if (Subtarget->isTargetWindows())
11115 return LowerWindowsGlobalTLSAddress(Op, DAG);
11116
11117 llvm_unreachable("Unexpected platform trying to use TLS");
11118}
11119
11120//===----------------------------------------------------------------------===//
11121// PtrAuthGlobalAddress lowering
11122//
11123// We have 3 lowering alternatives to choose from:
11124// - MOVaddrPAC: similar to MOVaddr, with added PAC.
11125// If the GV doesn't need a GOT load (i.e., is locally defined)
11126// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
11127//
11128// - LOADgotPAC: similar to LOADgot, with added PAC.
11129// If the GV needs a GOT load, materialize the pointer using the usual
11130// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
11131// section is assumed to be read-only (for example, via relro mechanism). See
11132// LowerMOVaddrPAC.
11133//
11134// - LOADauthptrstatic: similar to LOADgot, but use a
11135// special stub slot instead of a GOT slot.
11136// Load a signed pointer for symbol 'sym' from a stub slot named
11137// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
11138// resolving. This usually lowers to adrp+ldr, but also emits an entry into
11139// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
11140//
11141// All 3 are pseudos that are expand late to longer sequences: this lets us
11142// provide integrity guarantees on the to-be-signed intermediate values.
11143//
11144// LOADauthptrstatic is undesirable because it requires a large section filled
11145// with often similarly-signed pointers, making it a good harvesting target.
11146// Thus, it's only used for ptrauth references to extern_weak to avoid null
11147// checks.
11148
11149static SDValue LowerPtrAuthGlobalAddressStatically(
11150 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
11151 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
11152 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
11153 assert(TGN->getGlobal()->hasExternalWeakLinkage());
11154
11155 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
11156 // offset alone as a pointer if the symbol wasn't available, which would
11157 // probably break null checks in users. Ptrauth complicates things further:
11158 // error out.
11159 if (TGN->getOffset() != 0)
11160 report_fatal_error(
11161 "unsupported non-zero offset in weak ptrauth global reference");
11162
11163 if (!isNullConstant(AddrDiscriminator))
11164 report_fatal_error("unsupported weak addr-div ptrauth global");
11165
11166 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
11167 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
11168 {TGA, Key, Discriminator}),
11169 0);
11170}
11171
11172SDValue
11173AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
11174 SelectionDAG &DAG) const {
11175 SDValue Ptr = Op.getOperand(0);
11176 uint64_t KeyC = Op.getConstantOperandVal(1);
11177 SDValue AddrDiscriminator = Op.getOperand(2);
11178 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
11179 EVT VT = Op.getValueType();
11180 SDLoc DL(Op);
11181
11182 if (KeyC > AArch64PACKey::LAST)
11183 report_fatal_error("key in ptrauth global out of range [0, " +
11184 Twine((int)AArch64PACKey::LAST) + "]");
11185
11186 // Blend only works if the integer discriminator is 16-bit wide.
11187 if (!isUInt<16>(DiscriminatorC))
11188 report_fatal_error(
11189 "constant discriminator in ptrauth global out of range [0, 0xffff]");
11190
11191 // Choosing between 3 lowering alternatives is target-specific.
11192 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
11193 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
11194
11195 int64_t PtrOffsetC = 0;
11196 if (Ptr.getOpcode() == ISD::ADD) {
11197 PtrOffsetC = Ptr.getConstantOperandVal(1);
11198 Ptr = Ptr.getOperand(0);
11199 }
11200 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
11201 const GlobalValue *PtrGV = PtrN->getGlobal();
11202
11203 // Classify the reference to determine whether it needs a GOT load.
11204 const unsigned OpFlags =
11205 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
11206 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
11207 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
11208 "unsupported non-GOT op flags on ptrauth global reference");
11209
11210 // Fold any offset into the GV; our pseudos expect it there.
11211 PtrOffsetC += PtrN->getOffset();
11212 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
11213 /*TargetFlags=*/0);
11214 assert(PtrN->getTargetFlags() == 0 &&
11215 "unsupported target flags on ptrauth global");
11216
11217 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
11218 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
11219 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
11220 ? AddrDiscriminator
11221 : DAG.getRegister(AArch64::XZR, MVT::i64);
11222
11223 // No GOT load needed -> MOVaddrPAC
11224 if (!NeedsGOTLoad) {
11225 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
11226 return SDValue(
11227 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
11228 {TPtr, Key, TAddrDiscriminator, Discriminator}),
11229 0);
11230 }
11231
11232 // GOT load -> LOADgotPAC
11233 // Note that we disallow extern_weak refs to avoid null checks later.
11234 if (!PtrGV->hasExternalWeakLinkage())
11235 return SDValue(
11236 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
11237 {TPtr, Key, TAddrDiscriminator, Discriminator}),
11238 0);
11239
11240 // extern_weak ref -> LOADauthptrstatic
11241 return LowerPtrAuthGlobalAddressStatically(
11242 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
11243 DAG);
11244}
11245
11246// Looks through \param Val to determine the bit that can be used to
11247// check the sign of the value. It returns the unextended value and
11248// the sign bit position.
11249std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
11250 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
11251 return {Val.getOperand(0),
11252 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
11253 1};
11254
11255 if (Val.getOpcode() == ISD::SIGN_EXTEND)
11256 return {Val.getOperand(0),
11257 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
11258
11259 return {Val, Val.getValueSizeInBits() - 1};
11260}
11261
11262SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
11263 SDValue Chain = Op.getOperand(0);
11264 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
11265 SDValue LHS = Op.getOperand(2);
11266 SDValue RHS = Op.getOperand(3);
11267 SDValue Dest = Op.getOperand(4);
11268 SDLoc DL(Op);
11269
11270 MachineFunction &MF = DAG.getMachineFunction();
11271 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
11272 // will not be produced, as they are conditional branch instructions that do
11273 // not set flags.
11274 bool ProduceNonFlagSettingCondBr =
11275 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
11276
11277 // Handle f128 first, since lowering it will result in comparing the return
11278 // value of a libcall against zero, which is just what the rest of LowerBR_CC
11279 // is expecting to deal with.
11280 if (LHS.getValueType() == MVT::f128) {
11281 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11282
11283 // If softenSetCCOperands returned a scalar, we need to compare the result
11284 // against zero to select between true and false values.
11285 if (!RHS.getNode()) {
11286 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11287 CC = ISD::SETNE;
11288 }
11289 }
11290
11291 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
11292 // instruction.
11293 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
11294 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
11295 // Only lower legal XALUO ops.
11296 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
11297 return SDValue();
11298
11299 // The actual operation with overflow check.
11300 AArch64CC::CondCode OFCC;
11301 SDValue Value, Overflow;
11302 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
11303
11304 if (CC == ISD::SETNE)
11305 OFCC = getInvertedCondCode(OFCC);
11306 SDValue CCVal = getCondCode(DAG, OFCC);
11307
11308 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11309 Overflow);
11310 }
11311
11312 if (LHS.getValueType().isInteger()) {
11313 assert((LHS.getValueType() == RHS.getValueType()) &&
11314 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11315
11316 // If the RHS of the comparison is zero, we can potentially fold this
11317 // to a specialized branch.
11318 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11319 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
11320 if (CC == ISD::SETEQ) {
11321 // See if we can use a TBZ to fold in an AND as well.
11322 // TBZ has a smaller branch displacement than CBZ. If the offset is
11323 // out of bounds, a late MI-layer pass rewrites branches.
11324 // 403.gcc is an example that hits this case.
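        // For example, (brcond (seteq (and x, 8), 0), dest) becomes roughly:
        //   tbz x, #3, dest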
11325 if (LHS.getOpcode() == ISD::AND &&
11326 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11327 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
11328 SDValue Test = LHS.getOperand(0);
11329 uint64_t Mask = LHS.getConstantOperandVal(1);
11330 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, Test,
11331 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
11332 Dest);
11333 }
11334
11335 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
11336 } else if (CC == ISD::SETNE) {
11337 // See if we can use a TBZ to fold in an AND as well.
11338 // TBZ has a smaller branch displacement than CBZ. If the offset is
11339 // out of bounds, a late MI-layer pass rewrites branches.
11340 // 403.gcc is an example that hits this case.
11341 if (LHS.getOpcode() == ISD::AND &&
11342 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11343 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
11344 SDValue Test = LHS.getOperand(0);
11345 uint64_t Mask = LHS.getConstantOperandVal(1);
11346 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, Test,
11347 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
11348 Dest);
11349 }
11350
11351 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
11352 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
11353 // Don't combine AND since emitComparison converts the AND to an ANDS
11354 // (a.k.a. TST) and the test in the test bit and branch instruction
11355 // becomes redundant. This would also increase register pressure.
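        // For example, (brcond (setlt x:i64, 0), dest) becomes roughly:
        //   tbnz x, #63, dest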
11356 uint64_t SignBitPos;
11357 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11358 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
11359 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11360 }
11361 }
11362 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
11363 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
11364 // Don't combine AND since emitComparison converts the AND to an ANDS
11365 // (a.k.a. TST) and the test in the test bit and branch instruction
11366 // becomes redundant. This would also increase register pressure.
11367 uint64_t SignBitPos;
11368 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11369 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11370 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11371 }
11372
11373 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11374 // larger branch displacement but do prefer CB over cmp + br.
11375 if (Subtarget->hasCMPBR() &&
11377 ProduceNonFlagSettingCondBr) {
11378 SDValue Cond =
11380 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11381 Dest);
11382 }
11383
11384 SDValue CCVal;
11385 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11386 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11387 Cmp);
11388 }
11389
11390 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11391 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11392
11393 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11394 // clean. Some of them require two branches to implement.
11395 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11396 AArch64CC::CondCode CC1, CC2;
11397 changeFPCCToAArch64CC(CC, CC1, CC2);
11398 SDValue CC1Val = getCondCode(DAG, CC1);
11399 SDValue BR1 =
11400 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
11401 if (CC2 != AArch64CC::AL) {
11402 SDValue CC2Val = getCondCode(DAG, CC2);
11403 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11404 Cmp);
11405 }
11406
11407 return BR1;
11408}
11409
11410SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11411 SelectionDAG &DAG) const {
11412 if (!Subtarget->isNeonAvailable() &&
11413 !Subtarget->useSVEForFixedLengthVectors())
11414 return SDValue();
11415
11416 EVT VT = Op.getValueType();
11417 EVT IntVT = VT.changeTypeToInteger();
11418 SDLoc DL(Op);
11419
11420 SDValue In1 = Op.getOperand(0);
11421 SDValue In2 = Op.getOperand(1);
11422 EVT SrcVT = In2.getValueType();
11423
11424 if (!SrcVT.bitsEq(VT))
11425 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11426
11427 if (VT.isScalableVector())
11428 IntVT =
11429 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
11430
11431 if (VT.isFixedLengthVector() &&
11432 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11433 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11434
11435 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11436 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11437
11438 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11439 return convertFromScalableVector(DAG, VT, Res);
11440 }
11441
11442 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11443 // a SVE FCOPYSIGN.
11444 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11445 Subtarget->isSVEorStreamingSVEAvailable()) {
11446 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11447 return SDValue();
11448 EVT SVT = getPackedSVEVectorVT(VT);
11449
11450 SDValue Ins1 =
11451 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
11452 DAG.getConstant(0, DL, MVT::i64));
11453 SDValue Ins2 =
11454 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
11455 DAG.getConstant(0, DL, MVT::i64));
11456 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11457 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
11458 DAG.getConstant(0, DL, MVT::i64));
11459 }
11460
11461 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11462 if (VT.isScalableVector())
11463 return getSVESafeBitCast(VT, Op, DAG);
11464
11465 return DAG.getBitcast(VT, Op);
11466 };
11467
11468 SDValue VecVal1, VecVal2;
11469 EVT VecVT;
11470 auto SetVecVal = [&](int Idx = -1) {
11471 if (!VT.isVector()) {
11472 VecVal1 =
11473 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
11474 VecVal2 =
11475 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
11476 } else {
11477 VecVal1 = BitCast(VecVT, In1, DAG);
11478 VecVal2 = BitCast(VecVT, In2, DAG);
11479 }
11480 };
11481 if (VT.isVector()) {
11482 VecVT = IntVT;
11483 SetVecVal();
11484 } else if (VT == MVT::f64) {
11485 VecVT = MVT::v2i64;
11486 SetVecVal(AArch64::dsub);
11487 } else if (VT == MVT::f32) {
11488 VecVT = MVT::v4i32;
11489 SetVecVal(AArch64::ssub);
11490 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11491 VecVT = MVT::v8i16;
11492 SetVecVal(AArch64::hsub);
11493 } else {
11494 llvm_unreachable("Invalid type for copysign!");
11495 }
11496
11497 unsigned BitWidth = In1.getScalarValueSizeInBits();
11498 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11499
11500 // We want to materialize a mask with every bit but the high bit set, but the
11501 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11502 // 64-bit elements. Instead, materialize all bits set and then negate that.
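  // For example, for v2f64 this is roughly:
  //   movi v1.2d, #0xffffffffffffffff
  //   fneg v1.2d, v1.2d      // flips only the sign bit -> 0x7fff...ffff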
11503 if (VT == MVT::f64 || VT == MVT::v2f64) {
11504 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11505 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11506 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11507 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11508 }
11509
11510 SDValue BSP =
11511 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
11512 if (VT == MVT::f16 || VT == MVT::bf16)
11513 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11514 if (VT == MVT::f32)
11515 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11516 if (VT == MVT::f64)
11517 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11518
11519 return BitCast(VT, BSP, DAG);
11520}
11521
11522SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11523 SelectionDAG &DAG) const {
11524 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11525 Attribute::NoImplicitFloat))
11526 return SDValue();
11527
11528 EVT VT = Op.getValueType();
11529 if (VT.isScalableVector() ||
11530 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11531 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11532
11533 bool IsParity = Op.getOpcode() == ISD::PARITY;
11534 SDValue Val = Op.getOperand(0);
11535 SDLoc DL(Op);
11536
11537 // For i32, the generic parity expansion using EORs is more efficient than
11538 // going through the floating-point/SIMD path below.
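  // (That expansion repeatedly folds the value onto itself, roughly
  //  x ^= x >> 16; x ^= x >> 8; ...; x ^= x >> 1; parity = x & 1.)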
11539 if (VT == MVT::i32 && IsParity)
11540 return SDValue();
11541
11542 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11543 if (VT == MVT::i32 || VT == MVT::i64) {
11544 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11545 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11546 DAG.getUNDEF(ContainerVT), Val,
11547 DAG.getVectorIdxConstant(0, DL));
11548 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11549 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11550 DAG.getVectorIdxConstant(0, DL));
11551 if (IsParity)
11552 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11553 return Val;
11554 }
11555
11556 if (VT == MVT::i128) {
11557 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11558 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11559 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11560 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11561 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11562 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11563 if (IsParity)
11564 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11565 return Val;
11566 }
11567 }
11568
11569 if (!Subtarget->isNeonAvailable())
11570 return SDValue();
11571
11572 // If there is no CNT instruction available, GPR popcount can
11573 // be more efficiently lowered to the following sequence that uses
11574 // AdvSIMD registers/instructions as long as the copies to/from
11575 // the AdvSIMD registers are cheap.
11576 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11577 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11578 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11579 // FMOV X0, D0 // copy result back to integer reg
11580 if (VT == MVT::i32 || VT == MVT::i64) {
11581 if (VT == MVT::i32)
11582 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11583 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11584
11585 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11586 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11587 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11588 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11589 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11590 DAG.getConstant(0, DL, MVT::i64));
11591 if (IsParity)
11592 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11593 return AddV;
11594 } else if (VT == MVT::i128) {
11595 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11596
11597 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11598 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11599 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11600 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11601 DAG.getConstant(0, DL, MVT::i64));
11602 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11603 if (IsParity)
11604 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11605 return AddV;
11606 }
11607
11608 assert(!IsParity && "ISD::PARITY of vector types not supported");
11609
11610 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11611 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11612 "Unexpected type for custom ctpop lowering");
11613
11614 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11615 Val = DAG.getBitcast(VT8Bit, Val);
11616 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11617
11618 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11619 VT.getVectorNumElements() >= 2) {
11620 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11621 SDValue Zeros = DAG.getConstant(0, DL, DT);
11622 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11623
11624 if (VT == MVT::v2i64) {
11625 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11626 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11627 } else if (VT == MVT::v2i32) {
11628 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11629 } else if (VT == MVT::v4i32) {
11630 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11631 } else {
11632 llvm_unreachable("Unexpected type for custom ctpop lowering");
11633 }
11634
11635 return Val;
11636 }
11637
11638 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
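  // For example, a v4i32 ctpop becomes roughly:
  //   cnt    v0.16b, v0.16b
  //   uaddlp v0.8h,  v0.16b
  //   uaddlp v0.4s,  v0.8h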
11639 unsigned EltSize = 8;
11640 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11641 while (EltSize != VT.getScalarSizeInBits()) {
11642 EltSize *= 2;
11643 NumElts /= 2;
11644 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11645 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11646 }
11647
11648 return Val;
11649}
11650
11651SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11652 EVT VT = Op.getValueType();
11653 assert(VT.isScalableVector() ||
11654 useSVEForFixedLengthVectorVT(
11655 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11656
11657 SDLoc DL(Op);
11658 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11659 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11660}
11661
11662SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11663 SelectionDAG &DAG) const {
11664
11665 EVT VT = Op.getValueType();
11666 SDLoc DL(Op);
11667 unsigned Opcode = Op.getOpcode();
11668 ISD::CondCode CC;
11669 switch (Opcode) {
11670 default:
11671 llvm_unreachable("Wrong instruction");
11672 case ISD::SMAX:
11673 CC = ISD::SETGT;
11674 break;
11675 case ISD::SMIN:
11676 CC = ISD::SETLT;
11677 break;
11678 case ISD::UMAX:
11679 CC = ISD::SETUGT;
11680 break;
11681 case ISD::UMIN:
11682 CC = ISD::SETULT;
11683 break;
11684 }
11685
11686 // Note: This lowering only overrides NEON for v1i64 and v2i64, where we
11687 // prefer using SVE if available.
11688 if (VT.isScalableVector() ||
11689 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
11690 switch (Opcode) {
11691 default:
11692 llvm_unreachable("Wrong instruction");
11693 case ISD::SMAX:
11694 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11695 case ISD::SMIN:
11696 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11697 case ISD::UMAX:
11698 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11699 case ISD::UMIN:
11700 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11701 }
11702 }
11703
11704 SDValue Op0 = Op.getOperand(0);
11705 SDValue Op1 = Op.getOperand(1);
11706 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11707 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11708}
11709
11710SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11711 SelectionDAG &DAG) const {
11712 EVT VT = Op.getValueType();
11713
11714 if (VT.isScalableVector() ||
11715 useSVEForFixedLengthVectorVT(
11716 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11717 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11718
11719 SDLoc DL(Op);
11720 SDValue REVB;
11721 MVT VST;
11722
11723 switch (VT.getSimpleVT().SimpleTy) {
11724 default:
11725 llvm_unreachable("Invalid type for bitreverse!");
11726
11727 case MVT::v2i32: {
11728 VST = MVT::v8i8;
11729 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11730
11731 break;
11732 }
11733
11734 case MVT::v4i32: {
11735 VST = MVT::v16i8;
11736 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11737
11738 break;
11739 }
11740
11741 case MVT::v1i64: {
11742 VST = MVT::v8i8;
11743 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11744
11745 break;
11746 }
11747
11748 case MVT::v2i64: {
11749 VST = MVT::v16i8;
11750 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11751
11752 break;
11753 }
11754 }
11755
11756 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11757 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11758}
11759
11760 // Check whether N forms a continuous comparison sequence (an OR tree of XORs).
11761static bool
11762isOrXorChain(SDValue N, unsigned &Num,
11763 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11764 if (Num == MaxXors)
11765 return false;
11766
11767 // Skip the one-use zext
11768 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11769 N = N->getOperand(0);
11770
11771 // The leaf node must be XOR
11772 if (N->getOpcode() == ISD::XOR) {
11773 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11774 Num++;
11775 return true;
11776 }
11777
11778 // All the non-leaf nodes must be OR.
11779 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11780 return false;
11781
11782 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11783 isOrXorChain(N->getOperand(1), Num, WorkList))
11784 return true;
11785 return false;
11786}
11787
11788 // Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
11789 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
11790 SDValue LHS = N->getOperand(0);
11791 SDValue RHS = N->getOperand(1);
11792 SDLoc DL(N);
11793 EVT VT = N->getValueType(0);
11794 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
11795
11796 // Only handle integer compares.
11797 if (N->getOpcode() != ISD::SETCC)
11798 return SDValue();
11799
11800 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11801 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11802 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11803 unsigned NumXors = 0;
11804 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11805 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11806 isOrXorChain(LHS, NumXors, WorkList)) {
11807 SDValue XOR0, XOR1;
11808 std::tie(XOR0, XOR1) = WorkList[0];
11809 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11810 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11811 for (unsigned I = 1; I < WorkList.size(); I++) {
11812 std::tie(XOR0, XOR1) = WorkList[I];
11813 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11814 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11815 }
11816
11817 // Exit early by inverting the condition, which helps reduce indentation.
11818 return Cmp;
11819 }
11820
11821 return SDValue();
11822}
11823
11824SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11825
11826 if (Op.getValueType().isVector())
11827 return LowerVSETCC(Op, DAG);
11828
11829 bool IsStrict = Op->isStrictFPOpcode();
11830 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11831 unsigned OpNo = IsStrict ? 1 : 0;
11832 SDValue Chain;
11833 if (IsStrict)
11834 Chain = Op.getOperand(0);
11835 SDValue LHS = Op.getOperand(OpNo + 0);
11836 SDValue RHS = Op.getOperand(OpNo + 1);
11837 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11838 SDLoc DL(Op);
11839
11840 // We chose ZeroOrOneBooleanContents, so use zero and one.
11841 EVT VT = Op.getValueType();
11842 SDValue TVal = DAG.getConstant(1, DL, VT);
11843 SDValue FVal = DAG.getConstant(0, DL, VT);
11844
11845 // Handle f128 first, since one possible outcome is a normal integer
11846 // comparison which gets picked up by the next if statement.
11847 if (LHS.getValueType() == MVT::f128) {
11848 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11849 IsSignaling);
11850
11851 // If softenSetCCOperands returned a scalar, use it.
11852 if (!RHS.getNode()) {
11853 assert(LHS.getValueType() == Op.getValueType() &&
11854 "Unexpected setcc expansion!");
11855 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11856 }
11857 }
11858
11859 if (LHS.getValueType().isInteger()) {
11860 if (Subtarget->hasCSSC() && CC == ISD::SETNE && isNullConstant(RHS)) {
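      // With CSSC, (setcc x, 0, ne) can be lowered without a flag-setting
      // compare, e.g. roughly as "umin w0, w0, #1" for i32.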
11861 SDValue One = DAG.getConstant(1, DL, LHS.getValueType());
11862 SDValue UMin = DAG.getNode(ISD::UMIN, DL, LHS.getValueType(), LHS, One);
11863 SDValue Res = DAG.getZExtOrTrunc(UMin, DL, VT);
11864 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11865 }
11866 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11867
11868 SDValue CCVal;
11869 SDValue Cmp = getAArch64Cmp(
11870 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11871
11872 // Note that we inverted the condition above, so we reverse the order of
11873 // the true and false operands here. This will allow the setcc to be
11874 // matched to a single CSINC instruction.
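    // For example, (setcc x, y, eq) becomes roughly:
    //   cmp   x, y
    //   csinc res, wzr, wzr, ne   // i.e. cset res, eq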
11875 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11876 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11877 }
11878
11879 // Now we know we're dealing with FP values.
11880 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11881 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11882
11883 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11884 // and do the comparison.
11885 SDValue Cmp;
11886 if (IsStrict)
11887 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11888 else
11889 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11890
11891 AArch64CC::CondCode CC1, CC2;
11892 changeFPCCToAArch64CC(CC, CC1, CC2);
11893 SDValue Res;
11894 if (CC2 == AArch64CC::AL) {
11895 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11896 CC2);
11897 SDValue CC1Val = getCondCode(DAG, CC1);
11898
11899 // Note that we inverted the condition above, so we reverse the order of
11900 // the true and false operands here. This will allow the setcc to be
11901 // matched to a single CSINC instruction.
11902 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
11903 } else {
11904 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11905 // totally clean. Some of them require two CSELs to implement. As is in
11906 // this case, we emit the first CSEL and then emit a second using the output
11907 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11908
11909 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11910 SDValue CC1Val = getCondCode(DAG, CC1);
11911 SDValue CS1 =
11912 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11913
11914 SDValue CC2Val = getCondCode(DAG, CC2);
11915 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11916 }
11917 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
11918}
11919
11920SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11921 SelectionDAG &DAG) const {
11922
11923 SDValue LHS = Op.getOperand(0);
11924 SDValue RHS = Op.getOperand(1);
11925 EVT VT = LHS.getValueType();
11926 if (VT != MVT::i32 && VT != MVT::i64)
11927 return SDValue();
11928
11929 SDLoc DL(Op);
11930 SDValue Carry = Op.getOperand(2);
11931 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11932 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11933 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
11934 LHS, RHS, InvCarry);
11935
11936 EVT OpVT = Op.getValueType();
11937 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11938 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11939
11940 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11941 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11942 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
11943 // Inputs are swapped because the condition is inverted. This will allow
11944 // matching with a single CSINC instruction.
11945 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11946 Cmp.getValue(1));
11947}
11948
11949/// Emit vector comparison for floating-point values, producing a mask.
11950 static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
11951 AArch64CC::CondCode CC, bool NoNans, EVT VT,
11952 const SDLoc &DL, SelectionDAG &DAG) {
11953 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
11954 "function only supposed to emit natural comparisons");
11955
11956 switch (CC) {
11957 default:
11958 return SDValue();
11959 case AArch64CC::NE: {
11960 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11961 // Use vector semantics for the inversion to potentially save a copy between
11962 // SIMD and regular registers.
11963 if (!LHS.getValueType().isVector()) {
11964 EVT VecVT =
11965 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11966 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11967 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
11968 DAG.getUNDEF(VecVT), Fcmeq, Zero);
11969 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
11970 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
11971 }
11972 return DAG.getNOT(DL, Fcmeq, VT);
11973 }
11974 case AArch64CC::EQ:
11975 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11976 case AArch64CC::GE:
11977 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
11978 case AArch64CC::GT:
11979 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
11980 case AArch64CC::LE:
11981 if (!NoNans)
11982 return SDValue();
11983 // If we ignore NaNs then we can use the LS implementation.
11984 [[fallthrough]];
11985 case AArch64CC::LS:
11986 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
11987 case AArch64CC::LT:
11988 if (!NoNans)
11989 return SDValue();
11990 // If we ignore NaNs then we can use the MI implementation.
11991 [[fallthrough]];
11992 case AArch64CC::MI:
11993 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
11994 }
11995}
11996
11997/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
11998/// values are scalars, try to emit a mask generating vector instruction.
11999 static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
12000 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
12001 const SDLoc &DL, SelectionDAG &DAG) {
12002 assert(!LHS.getValueType().isVector());
12003 assert(!RHS.getValueType().isVector());
12004
12005 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
12006 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
12007 if (!CTVal || !CFVal)
12008 return {};
12009 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
12010 !(CTVal->isZero() && CFVal->isAllOnes()))
12011 return {};
12012
12013 if (CTVal->isZero())
12014 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12015
12016 EVT VT = TVal.getValueType();
12017 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
12018 return {};
12019
12020 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
12021 bool OneNaN = false;
12022 if (LHS == RHS) {
12023 OneNaN = true;
12024 } else if (DAG.isKnownNeverNaN(RHS)) {
12025 OneNaN = true;
12026 RHS = LHS;
12027 } else if (DAG.isKnownNeverNaN(LHS)) {
12028 OneNaN = true;
12029 LHS = RHS;
12030 }
12031 if (OneNaN)
12032 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
12033 }
12034
12035 AArch64CC::CondCode CC1;
12036 AArch64CC::CondCode CC2;
12037 bool ShouldInvert = false;
12038 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
12039 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
12040 SDValue Cmp2;
12041 if (CC2 != AArch64CC::AL) {
12042 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
12043 if (!Cmp2)
12044 return {};
12045 }
12046 if (!Cmp2 && !ShouldInvert)
12047 return Cmp;
12048
12049 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
12050 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12051 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
12052 Zero);
12053 if (Cmp2) {
12054 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
12055 Cmp2, Zero);
12056 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
12057 }
12058 if (ShouldInvert)
12059 Cmp = DAG.getNOT(DL, Cmp, VecVT);
12060 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
12061 return Cmp;
12062}
12063
12064SDValue AArch64TargetLowering::LowerSELECT_CC(
12065 ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
12066 iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
12067 const SDLoc &DL, SelectionDAG &DAG) const {
12068 // Handle f128 first, because it will result in a comparison of some RTLIB
12069 // call result against zero.
12070 if (LHS.getValueType() == MVT::f128) {
12071 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
12072
12073 // If softenSetCCOperands returned a scalar, we need to compare the result
12074 // against zero to select between true and false values.
12075 if (!RHS.getNode()) {
12076 RHS = DAG.getConstant(0, DL, LHS.getValueType());
12077 CC = ISD::SETNE;
12078 }
12079 }
12080
12081 // Also handle f16, for which we need to do a f32 comparison.
12082 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
12083 LHS.getValueType() == MVT::bf16) {
12084 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
12085 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
12086 }
12087
12088 // Next, handle integers.
12089 if (LHS.getValueType().isInteger()) {
12090 assert((LHS.getValueType() == RHS.getValueType()) &&
12091 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
12092
12093 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
12094 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
12095 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
12096
12097 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
12098 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
12099 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
12100 // Both require fewer instructions than a compare and conditional select.
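      // For example, i32 smax(x, 0) becomes roughly:
      //   asr w8, w0, #31
      //   bic w0, w0, w8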
12101 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
12102 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
12103 LHS.getValueType() == RHS.getValueType()) {
12104 EVT VT = LHS.getValueType();
12105 SDValue Shift =
12106 DAG.getNode(ISD::SRA, DL, VT, LHS,
12107 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
12108
12109 if (CC == ISD::SETGT)
12110 Shift = DAG.getNOT(DL, Shift, VT);
12111
12112 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
12113 }
12114
12115 // Check for sign bit test patterns that can use TST optimization.
12116 // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
12117 // -> TST %operand, sign_bit; CSEL
12118 // (SELECT_CC setlt, sign_extend, 0, tval, fval)
12119 // -> TST %operand, sign_bit; CSEL
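      // For example, (select_cc setlt, (sext_inreg x, i8), 0, t, f) becomes
      // roughly: tst x, #0x80; csel res, t, f, ne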
12120 if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
12121 (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
12122 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12123
12124 uint64_t SignBitPos;
12125 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
12126 EVT TestVT = LHS.getValueType();
12127 SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
12128 SDValue TST =
12129 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
12130 LHS, SignBitConst);
12131
12132 SDValue Flags = TST.getValue(1);
12133 return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
12134 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
12135 }
12136
12137 // Canonicalise absolute difference patterns:
12138 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
12139 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
12140 //
12141 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
12142 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
12143 // The second forms can be matched into subs+cneg.
12144 // NOTE: Drop poison generating flags from the negated operand to avoid
12145 // inadvertently propagating poison after the canonicalisation.
12146 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
12147 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
12148 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
12150 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
12151 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
12152 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
12154 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
12155 }
12156 }
12157
12158 unsigned Opcode = AArch64ISD::CSEL;
12159
12160 // If both the TVal and the FVal are constants, see if we can swap them in
12161 // order to form a CSINV or CSINC out of them.
12162 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
12163 std::swap(TVal, FVal);
12164 std::swap(CTVal, CFVal);
12165 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12166 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
12167 std::swap(TVal, FVal);
12168 std::swap(CTVal, CFVal);
12169 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12170 } else if (TVal.getOpcode() == ISD::XOR) {
12171 // If TVal is a NOT we want to swap TVal and FVal so that we can match
12172 // with a CSINV rather than a CSEL.
12173 if (isAllOnesConstant(TVal.getOperand(1))) {
12174 std::swap(TVal, FVal);
12175 std::swap(CTVal, CFVal);
12176 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12177 }
12178 } else if (TVal.getOpcode() == ISD::SUB) {
12179 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
12180 // that we can match with a CSNEG rather than a CSEL.
12181 if (isNullConstant(TVal.getOperand(0))) {
12182 std::swap(TVal, FVal);
12183 std::swap(CTVal, CFVal);
12184 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12185 }
12186 } else if (CTVal && CFVal) {
12187 const int64_t TrueVal = CTVal->getSExtValue();
12188 const int64_t FalseVal = CFVal->getSExtValue();
12189 bool Swap = false;
12190
12191 // If both TVal and FVal are constants, see if FVal is the
12192 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
12193 // instead of a CSEL in that case.
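      // For example, (select c, 8, 7) becomes roughly:
      //   mov  w8, #7
      //   cinc w0, w8, <c>          // csinc w0, w8, w8, inv(<c>)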
12194 if (TrueVal == ~FalseVal) {
12195 Opcode = AArch64ISD::CSINV;
12196 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
12197 TrueVal == -FalseVal) {
12198 Opcode = AArch64ISD::CSNEG;
12199 } else if (TVal.getValueType() == MVT::i32) {
12200 // If our operands are only 32-bit wide, make sure we use 32-bit
12201 // arithmetic for the check whether we can use CSINC. This ensures that
12202 // the addition in the check will wrap around properly in case there is
12203 // an overflow (which would not be the case if we do the check with
12204 // 64-bit arithmetic).
12205 const uint32_t TrueVal32 = CTVal->getZExtValue();
12206 const uint32_t FalseVal32 = CFVal->getZExtValue();
12207
12208 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
12209 Opcode = AArch64ISD::CSINC;
12210
12211 if (TrueVal32 > FalseVal32) {
12212 Swap = true;
12213 }
12214 }
12215 } else {
12216 // 64-bit check whether we can use CSINC.
12217 const uint64_t TrueVal64 = TrueVal;
12218 const uint64_t FalseVal64 = FalseVal;
12219
12220 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
12221 Opcode = AArch64ISD::CSINC;
12222
12223 if (TrueVal > FalseVal) {
12224 Swap = true;
12225 }
12226 }
12227 }
12228
12229 // Swap TVal and FVal if necessary.
12230 if (Swap) {
12231 std::swap(TVal, FVal);
12232 std::swap(CTVal, CFVal);
12233 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12234 }
12235
12236 if (Opcode != AArch64ISD::CSEL) {
12237 // Drop FVal since we can get its value by simply inverting/negating
12238 // TVal.
12239 FVal = TVal;
12240 }
12241 }
12242
12243 // Avoid materializing a constant when possible by reusing a known value in
12244 // a register. However, don't perform this optimization if the known value
12245 // is one, zero or negative one in the case of a CSEL. We can always
12246 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
12247 // FVal, respectively.
12248 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
12249 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
12250 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
12251 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
12252 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
12253 // "a != C ? x : a" to avoid materializing C.
12254 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
12255 TVal = LHS;
12256 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
12257 FVal = LHS;
12258 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
12259 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
12260 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
12261 // avoid materializing C.
12262 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
12263 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
12264 Opcode = AArch64ISD::CSINV;
12265 TVal = LHS;
12266 FVal = DAG.getConstant(0, DL, FVal.getValueType());
12267 }
12268 }
12269
12270 SDValue CCVal;
12271 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
12272 EVT VT = TVal.getValueType();
12273 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
12274 }
12275
12276 // Now we know we're dealing with FP values.
12277 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
12278 LHS.getValueType() == MVT::f64);
12279 assert(LHS.getValueType() == RHS.getValueType());
12280 EVT VT = TVal.getValueType();
12281
12282 // If the purpose of the comparison is to select between all ones
12283 // or all zeros, try to use a vector comparison because the operands are
12284 // already stored in SIMD registers.
12285 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
12286 switch (U->getOpcode()) {
12287 default:
12288 return false;
12291 case AArch64ISD::DUP:
12292 return true;
12293 }
12294 })) {
12295 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
12296 SDValue VectorCmp =
12297 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
12298 if (VectorCmp)
12299 return VectorCmp;
12300 }
12301
12302 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
12303
12304 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
12305 // clean. Some of them require two CSELs to implement.
12306 AArch64CC::CondCode CC1, CC2;
12307 changeFPCCToAArch64CC(CC, CC1, CC2);
12308
12309 if (Flags.hasNoSignedZeros()) {
12310 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
12311 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
12312 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
12313 if (RHSVal && RHSVal->isZero()) {
12314 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
12315 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
12316
12317 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
12318 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
12319 TVal = LHS;
12320 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
12321 CFVal && CFVal->isZero() &&
12322 FVal.getValueType() == LHS.getValueType())
12323 FVal = LHS;
12324 }
12325 }
12326
12327 // Emit first, and possibly only, CSEL.
12328 SDValue CC1Val = getCondCode(DAG, CC1);
12329 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
12330
12331 // If we need a second CSEL, emit it, using the output of the first as the
12332 // RHS. We're effectively OR'ing the two CC's together.
12333 if (CC2 != AArch64CC::AL) {
12334 SDValue CC2Val = getCondCode(DAG, CC2);
12335 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12336 }
12337
12338 // Otherwise, return the output of the first CSEL.
12339 return CS1;
12340}
12341
12342SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
12343 SelectionDAG &DAG) const {
12344 EVT Ty = Op.getValueType();
12345 auto Idx = Op.getConstantOperandAPInt(2);
12346 int64_t IdxVal = Idx.getSExtValue();
12347 assert(Ty.isScalableVector() &&
12348 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
12349
12350 // We can use the splice instruction for certain index values where we are
12351 // able to efficiently generate the correct predicate. The index will be
12352 // inverted and used directly as the input to the ptrue instruction, i.e.
12353 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
12354 // splice predicate. However, we can only do this if we can guarantee that
12355 // there are enough elements in the vector, hence we check the index <= min
12356 // number of elements.
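  // For example, splicing two nxv4i32 vectors at index -2 becomes roughly:
  //   ptrue  p0.s, vl2
  //   rev    p0.s, p0.s
  //   splice z0.s, p0, z0.s, z1.s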
12357 std::optional<unsigned> PredPattern;
12358 if (Ty.isScalableVector() && IdxVal < 0 &&
12359 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
12360 std::nullopt) {
12361 SDLoc DL(Op);
12362
12363 // Create a predicate where all but the last -IdxVal elements are false.
12364 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
12365 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
12366 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
12367
12368 // Now splice the two inputs together using the predicate.
12369 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
12370 Op.getOperand(1));
12371 }
12372
12373 // We can select to an EXT instruction when indexing the first 256 bytes.
12374 unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
12375 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
12376 return Op;
12377
12378 return SDValue();
12379}
12380
12381SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
12382 SelectionDAG &DAG) const {
12383 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
12384 SDValue LHS = Op.getOperand(0);
12385 SDValue RHS = Op.getOperand(1);
12386 SDValue TVal = Op.getOperand(2);
12387 SDValue FVal = Op.getOperand(3);
12388 SDNodeFlags Flags = Op->getFlags();
12389 SDLoc DL(Op);
12390 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
12391}
12392
12393SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
12394 SelectionDAG &DAG) const {
12395 SDValue CCVal = Op->getOperand(0);
12396 SDValue TVal = Op->getOperand(1);
12397 SDValue FVal = Op->getOperand(2);
12398 SDLoc DL(Op);
12399
12400 EVT Ty = Op.getValueType();
12401 if (Ty == MVT::aarch64svcount) {
12402 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
12403 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
12404 SDValue Sel =
12405 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
12406 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
12407 }
12408
12409 if (Ty.isScalableVector()) {
12410 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
12411 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
12412 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12413 }
12414
12415 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
12416 // FIXME: Ideally this would be the same as above using i1 types, however
12417 // for the moment we can't deal with fixed i1 vector types properly, so
12418 // instead extend the predicate to a result type sized integer vector.
12419 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
12420 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
12421 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
12422 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
12423 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12424 }
12425
12426 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
12427 // instruction.
12428 if (ISD::isOverflowIntrOpRes(CCVal)) {
12429 // Only lower legal XALUO ops.
12430 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
12431 return SDValue();
12432
12433 AArch64CC::CondCode OFCC;
12434 SDValue Value, Overflow;
12435 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
12436 SDValue CCVal = getCondCode(DAG, OFCC);
12437
12438 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
12439 CCVal, Overflow);
12440 }
12441
12442 // Lower it the same way as we would lower a SELECT_CC node.
12443 ISD::CondCode CC;
12444 SDValue LHS, RHS;
12445 if (CCVal.getOpcode() == ISD::SETCC) {
12446 LHS = CCVal.getOperand(0);
12447 RHS = CCVal.getOperand(1);
12448 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
12449 } else {
12450 LHS = CCVal;
12451 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
12452 CC = ISD::SETNE;
12453 }
12454
12455 // If we are lowering an f16/bf16 and we do not have full fp16 support,
12456 // convert to an f32 in order to use FCSELSrrr.
12457 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12458 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12459 DAG.getUNDEF(MVT::f32), TVal);
12460 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12461 DAG.getUNDEF(MVT::f32), FVal);
12462 }
12463
12464 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
12465 Op->getFlags(), DL, DAG);
12466
12467 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12468 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
12469 }
12470
12471 return Res;
12472}
12473
12474SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
12475 SelectionDAG &DAG) const {
12476 // Jump table entries as PC relative offsets. No additional tweaking
12477 // is necessary here. Just get the address of the jump table.
12478 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12479
12480 CodeModel::Model CM = getTargetMachine().getCodeModel();
12481 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
12482 !Subtarget->isTargetMachO())
12483 return getAddrLarge(JT, DAG);
12484 if (CM == CodeModel::Tiny)
12485 return getAddrTiny(JT, DAG);
12486 return getAddr(JT, DAG);
12487}
12488
12489SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
12490 SelectionDAG &DAG) const {
12491 // Jump table entries as PC relative offsets. No additional tweaking
12492 // is necessary here. Just get the address of the jump table.
12493 SDLoc DL(Op);
12494 SDValue JT = Op.getOperand(1);
12495 SDValue Entry = Op.getOperand(2);
12496 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
12497
12498 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12499 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
12500
12501 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
12502 // sequence later, to guarantee the integrity of the intermediate values.
12503 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
12504 "aarch64-jump-table-hardening")) {
12505 CodeModel::Model CM = getTargetMachine().getCodeModel();
12506 if (Subtarget->isTargetMachO()) {
12507 if (CM != CodeModel::Small && CM != CodeModel::Large)
12508 report_fatal_error("Unsupported code-model for hardened jump-table");
12509 } else {
12510 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
12511 assert(Subtarget->isTargetELF() &&
12512 "jump table hardening only supported on MachO/ELF");
12513 if (CM != CodeModel::Small)
12514 report_fatal_error("Unsupported code-model for hardened jump-table");
12515 }
12516
12517 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
12518 Entry, SDValue());
12519 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
12520 DAG.getTargetJumpTable(JTI, MVT::i32),
12521 X16Copy.getValue(0), X16Copy.getValue(1));
12522 return SDValue(B, 0);
12523 }
12524
12525 SDNode *Dest =
12526 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
12527 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
12528 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
12529 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
12530}
12531
12532SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
12533 SDValue Chain = Op.getOperand(0);
12534 SDValue Dest = Op.getOperand(1);
12535
12536 // BR_JT is lowered to BRIND, but the later lowering is specific to indirectbr
12537 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
12538 if (Dest->isMachineOpcode() &&
12539 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
12540 return SDValue();
12541
12542 const MachineFunction &MF = DAG.getMachineFunction();
12543 std::optional<uint16_t> BADisc =
12544 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
12545 if (!BADisc)
12546 return SDValue();
12547
12548 SDLoc DL(Op);
12549
12550 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12551 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12552 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12553
12554 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
12555 {Dest, Key, Disc, AddrDisc, Chain});
12556 return SDValue(BrA, 0);
12557}
12558
12559SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
12560 SelectionDAG &DAG) const {
12561 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12562 CodeModel::Model CM = getTargetMachine().getCodeModel();
12563 if (CM == CodeModel::Large) {
12564 // Use the GOT for the large code model on iOS.
12565 if (Subtarget->isTargetMachO()) {
12566 return getGOT(CP, DAG);
12567 }
12568 if (!getTargetMachine().isPositionIndependent())
12569 return getAddrLarge(CP, DAG);
12570 } else if (CM == CodeModel::Tiny) {
12571 return getAddrTiny(CP, DAG);
12572 }
12573 return getAddr(CP, DAG);
12574}
12575
12576SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
12577 SelectionDAG &DAG) const {
12578 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
12579 const BlockAddress *BA = BAN->getBlockAddress();
12580
12581 if (std::optional<uint16_t> BADisc =
12582 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
12583 *BA->getFunction())) {
12584 SDLoc DL(Op);
12585
12586 // This isn't cheap, but BRIND is rare.
12587 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
12588
12589 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12590
12591 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12592 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12593
12594 SDNode *MOV =
12595 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
12596 {TargetBA, Key, AddrDisc, Disc});
12597 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
12598 SDValue(MOV, 1));
12599 }
12600
12601 CodeModel::Model CM = getTargetMachine().getCodeModel();
12602 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
12603 if (!getTargetMachine().isPositionIndependent())
12604 return getAddrLarge(BAN, DAG);
12605 } else if (CM == CodeModel::Tiny) {
12606 return getAddrTiny(BAN, DAG);
12607 }
12608 return getAddr(BAN, DAG);
12609}
12610
12611SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
12612 SelectionDAG &DAG) const {
12613 AArch64FunctionInfo *FuncInfo =
12614 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12615
12616 SDLoc DL(Op);
12617 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
12618 getPointerTy(DAG.getDataLayout()));
12619 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
12620 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12621 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12622 MachinePointerInfo(SV));
12623}
12624
12625SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
12626 SelectionDAG &DAG) const {
12627 MachineFunction &MF = DAG.getMachineFunction();
12628 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12629
12630 SDLoc DL(Op);
12631 SDValue FR;
12632 if (Subtarget->isWindowsArm64EC()) {
12633 // With the Arm64EC ABI, we compute the address of the varargs save area
12634 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
12635 // but calls from an entry thunk can pass in a different address.
12636 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
12637 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
12638 uint64_t StackOffset;
12639 if (FuncInfo->getVarArgsGPRSize() > 0)
12640 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
12641 else
12642 StackOffset = FuncInfo->getVarArgsStackOffset();
12643 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
12644 DAG.getConstant(StackOffset, DL, MVT::i64));
12645 } else {
12646 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
12647 ? FuncInfo->getVarArgsGPRIndex()
12648 : FuncInfo->getVarArgsStackIndex(),
12649 getPointerTy(DAG.getDataLayout()));
12650 }
12651 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12652 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12653 MachinePointerInfo(SV));
12654}
12655
12656SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
12657 SelectionDAG &DAG) const {
12658 // The layout of the va_list struct is specified in the AArch64 Procedure Call
12659 // Standard, section B.3.
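// Illustrative sketch (not from the original source) of the va_list object
// this code populates, using the LP64 offsets noted in the comments below:
//
//   struct va_list {
//     void *__stack;    // offset 0:  next stacked argument
//     void *__gr_top;   // offset 8:  byte past the saved general registers
//     void *__vr_top;   // offset 16: byte past the saved FP/SIMD registers
//     int   __gr_offs;  // offset 24: negative byte offset from __gr_top to
//                       //            the next unallocated general register
//     int   __vr_offs;  // offset 28: same, for the FP/SIMD save area
//   };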
12660 MachineFunction &MF = DAG.getMachineFunction();
12661 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12662 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12663 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12664 auto PtrVT = getPointerTy(DAG.getDataLayout());
12665 SDLoc DL(Op);
12666
12667 SDValue Chain = Op.getOperand(0);
12668 SDValue VAList = Op.getOperand(1);
12669 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12670 SmallVector<SDValue, 4> MemOps;
12671
12672 // void *__stack at offset 0
12673 unsigned Offset = 0;
12674 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12675 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12676 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12677 MachinePointerInfo(SV), Align(PtrSize)));
12678
12679 // void *__gr_top at offset 8 (4 on ILP32)
12680 Offset += PtrSize;
12681 int GPRSize = FuncInfo->getVarArgsGPRSize();
12682 if (GPRSize > 0) {
12683 SDValue GRTop, GRTopAddr;
12684
12685 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12686 DAG.getConstant(Offset, DL, PtrVT));
12687
12688 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12689 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12690 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12691 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12692
12693 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12694 MachinePointerInfo(SV, Offset),
12695 Align(PtrSize)));
12696 }
12697
12698 // void *__vr_top at offset 16 (8 on ILP32)
12699 Offset += PtrSize;
12700 int FPRSize = FuncInfo->getVarArgsFPRSize();
12701 if (FPRSize > 0) {
12702 SDValue VRTop, VRTopAddr;
12703 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12704 DAG.getConstant(Offset, DL, PtrVT));
12705
12706 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12707 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12708 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12709 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12710
12711 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12712 MachinePointerInfo(SV, Offset),
12713 Align(PtrSize)));
12714 }
12715
12716 // int __gr_offs at offset 24 (12 on ILP32)
12717 Offset += PtrSize;
12718 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12719 DAG.getConstant(Offset, DL, PtrVT));
12720 MemOps.push_back(
12721 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12722 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12723
12724 // int __vr_offs at offset 28 (16 on ILP32)
12725 Offset += 4;
12726 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12727 DAG.getConstant(Offset, DL, PtrVT));
12728 MemOps.push_back(
12729 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12730 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12731
12732 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12733}
12734
12735SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12736 SelectionDAG &DAG) const {
12737 MachineFunction &MF = DAG.getMachineFunction();
12738 Function &F = MF.getFunction();
12739
12740 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12741 return LowerWin64_VASTART(Op, DAG);
12742 else if (Subtarget->isTargetDarwin())
12743 return LowerDarwin_VASTART(Op, DAG);
12744 else
12745 return LowerAAPCS_VASTART(Op, DAG);
12746}
12747
12748SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12749 SelectionDAG &DAG) const {
12750 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
12751 // pointer.
12752 SDLoc DL(Op);
12753 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12754 unsigned VaListSize =
12755 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12756 ? PtrSize
12757 : Subtarget->isTargetILP32() ? 20 : 32;
12758 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12759 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12760
12761 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12762 DAG.getConstant(VaListSize, DL, MVT::i32),
12763 Align(PtrSize), false, false, /*CI=*/nullptr,
12764 std::nullopt, MachinePointerInfo(DestSV),
12765 MachinePointerInfo(SrcSV));
12766}
12767
12768SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12769 assert(Subtarget->isTargetDarwin() &&
12770 "automatic va_arg instruction only works on Darwin");
12771
12772 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12773 EVT VT = Op.getValueType();
12774 SDLoc DL(Op);
12775 SDValue Chain = Op.getOperand(0);
12776 SDValue Addr = Op.getOperand(1);
12777 MaybeAlign Align(Op.getConstantOperandVal(3));
12778 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12779 auto PtrVT = getPointerTy(DAG.getDataLayout());
12780 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12781 SDValue VAList =
12782 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12783 Chain = VAList.getValue(1);
12784 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12785
12786 if (VT.isScalableVector())
12787 report_fatal_error("Passing SVE types to variadic functions is "
12788 "currently not supported");
12789
12790 if (Align && *Align > MinSlotSize) {
12791 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12792 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12793 VAList =
12794 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12795 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12796 }
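  // Illustrative example: for a 16-byte aligned type on LP64 this computes
  //   VAList = (VAList + 15) & ~15,
  // i.e. the va_list cursor is rounded up to the next 16-byte boundary before
  // the argument is loaded.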
12797
12798 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12799 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12800
12801 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12802 // up to 64 bits. At the very least, we have to increase the striding of the
12803 // vaargs list to match this, and for FP values we need to introduce
12804 // FP_ROUND nodes as well.
12805 if (VT.isInteger() && !VT.isVector())
12806 ArgSize = std::max(ArgSize, MinSlotSize);
12807 bool NeedFPTrunc = false;
12808 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12809 ArgSize = 8;
12810 NeedFPTrunc = true;
12811 }
12812
12813 // Increment the pointer, VAList, to the next vaarg
12814 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12815 DAG.getConstant(ArgSize, DL, PtrVT));
12816 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12817
12818 // Store the incremented VAList to the legalized pointer
12819 SDValue APStore =
12820 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12821
12822 // Load the actual argument out of the pointer VAList
12823 if (NeedFPTrunc) {
12824 // Load the value as an f64.
12825 SDValue WideFP =
12826 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12827 // Round the value down to an f32.
12828 SDValue NarrowFP =
12829 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12830 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12831 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12832 // Merge the rounded value with the chain output of the load.
12833 return DAG.getMergeValues(Ops, DL);
12834 }
12835
12836 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12837}
12838
12839SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12840 SelectionDAG &DAG) const {
12841 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12842 MFI.setFrameAddressIsTaken(true);
12843
12844 EVT VT = Op.getValueType();
12845 SDLoc DL(Op);
12846 unsigned Depth = Op.getConstantOperandVal(0);
12847 SDValue FrameAddr =
12848 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12849 while (Depth--)
12850 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12851 MachinePointerInfo());
12852
12853 if (Subtarget->isTargetILP32())
12854 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12855 DAG.getValueType(VT));
12856
12857 return FrameAddr;
12858}
12859
12860SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12861 SelectionDAG &DAG) const {
12862 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12863
12864 EVT VT = getPointerTy(DAG.getDataLayout());
12865 int FI = MFI.CreateFixedObject(4, 0, false);
12866 return DAG.getFrameIndex(FI, VT);
12867}
12868
12869#define GET_REGISTER_MATCHER
12870#include "AArch64GenAsmMatcher.inc"
12871
12872// FIXME? Maybe this could be a TableGen attribute on some registers and
12873// this table could be generated automatically from RegInfo.
12874Register AArch64TargetLowering::
12875getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12876 Register Reg = MatchRegisterName(RegName);
12877 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12878 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12879 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
12880 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12881 !MRI->isReservedReg(MF, Reg))
12882 Reg = Register();
12883 }
12884 return Reg;
12885}
12886
12887SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12888 SelectionDAG &DAG) const {
12890
12891 EVT VT = Op.getValueType();
12892 SDLoc DL(Op);
12893
12894 SDValue FrameAddr =
12895 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
12896 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
12897
12898 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
12899}
12900
12901SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12902 SelectionDAG &DAG) const {
12903 MachineFunction &MF = DAG.getMachineFunction();
12904 MachineFrameInfo &MFI = MF.getFrameInfo();
12905 MFI.setReturnAddressIsTaken(true);
12906
12907 EVT VT = Op.getValueType();
12908 SDLoc DL(Op);
12909 unsigned Depth = Op.getConstantOperandVal(0);
12910 SDValue ReturnAddress;
12911 if (Depth) {
12912 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12913 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
12914 ReturnAddress = DAG.getLoad(
12915 VT, DL, DAG.getEntryNode(),
12916 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
12917 } else {
12918 // Return LR, which contains the return address. Mark it an implicit
12919 // live-in.
12920 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
12921 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
12922 }
12923
12924 // The XPACLRI instruction assembles to a hint-space instruction before
12925 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture,
12926 // where it executes as a NOP. On Armv8.3-A and onwards, XPACI is available,
12927 // so use that instead.
12928 SDNode *St;
12929 if (Subtarget->hasPAuth()) {
12930 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
12931 } else {
12932 // XPACLRI operates on LR therefore we must move the operand accordingly.
12933 SDValue Chain =
12934 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
12935 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
12936 }
12937 return SDValue(St, 0);
12938}
12939
12940 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
12941 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
12942SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
12943 SelectionDAG &DAG) const {
12944 SDValue Lo, Hi;
12945 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
12946 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
12947}
12948
12949 bool AArch64TargetLowering::isOffsetFoldingLegal(
12950 const GlobalAddressSDNode *GA) const {
12951 // Offsets are folded in the DAG combine rather than here so that we can
12952 // intelligently choose an offset based on the uses.
12953 return false;
12954}
12955
12956 bool AArch64TargetLowering::isFPImmLegal(EVT VT, const APFloat &Imm,
12957 bool OptForSize) const {
12958 bool IsLegal = false;
12959 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
12960 // and for the 16-bit case when the target has full fp16 support.
12961 // We encode bf16 bit patterns as if they were fp16. This results in very
12962 // strange looking assembly but should populate the register with appropriate
12963 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12964 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12965 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
12966 // FIXME: We should be able to handle f128 as well with a clever lowering.
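  // For reference (illustrative note, not from the original source): the FMOV
  // floating-point imm8 field encodes values of the form
  //   +/- (1 + m/16) * 2^e   with m in [0, 15] and e in [-3, 4],
  // so e.g. 0.25, 1.0, 2.5 and 31.0 are encodable while 0.1 is not, and +0.0
  // needs the separate "fmov $Rd, XZR" path mentioned above.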
12967 const APInt ImmInt = Imm.bitcastToAPInt();
12968 if (VT == MVT::f64)
12969 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
12970 else if (VT == MVT::f32)
12971 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
12972 else if (VT == MVT::f16 || VT == MVT::bf16)
12973 IsLegal =
12974 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
12975 Imm.isPosZero();
12976
12977 // If we cannot materialize the value in the immediate field of an fmov,
12978 // check if it can be encoded as the immediate operand of a logical instruction.
12979 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12980 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12981 // generate that fmov.
12982 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12983 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12984 // however the mov+fmov sequence is always better because of the reduced
12985 // cache pressure. The timings are still the same if you consider
12986 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12987 // movw+movk is fused). So we limit the sequence to at most 2 instructions.
12988 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12989 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
12990 assert(Insn.size() <= 4 &&
12991 "Should be able to build any value with at most 4 moves");
12992 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12993 IsLegal = Insn.size() <= Limit;
12994 }
12995
12996 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12997 << " imm value: "; Imm.dump(););
12998 return IsLegal;
12999}
13000
13001//===----------------------------------------------------------------------===//
13002// AArch64 Optimization Hooks
13003//===----------------------------------------------------------------------===//
13004
13005static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
13006 SDValue Operand, SelectionDAG &DAG,
13007 int &ExtraSteps) {
13008 EVT VT = Operand.getValueType();
13009 if ((ST->hasNEON() &&
13010 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
13011 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
13012 VT == MVT::v4f32)) ||
13013 (ST->hasSVE() &&
13014 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
13015 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
13016 // For the reciprocal estimates, convergence is quadratic, so the number
13017 // of digits is doubled after each iteration. In ARMv8, the accuracy of
13018 // the initial estimate is 2^-8. Thus the number of extra steps to refine
13019 // the result for float (23 mantissa bits) is 2 and for double (52
13020 // mantissa bits) is 3.
13021 constexpr unsigned AccurateBits = 8;
13022 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
13023 ExtraSteps = DesiredBits <= AccurateBits
13024 ? 0
13025 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
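      // Worked example: float has 24 bits of precision, so ExtraSteps =
      // ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2; double has 53 bits,
      // giving 6 - 3 = 3 refinement steps, matching the comment above.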
13026 }
13027
13028 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
13029 }
13030
13031 return SDValue();
13032}
13033
13034SDValue
13035AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
13036 const DenormalMode &Mode) const {
13037 SDLoc DL(Op);
13038 EVT VT = Op.getValueType();
13039 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
13040 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
13041 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
13042}
13043
13044SDValue
13045AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
13046 SelectionDAG &DAG) const {
13047 return Op;
13048}
13049
13050SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
13051 SelectionDAG &DAG, int Enabled,
13052 int &ExtraSteps,
13053 bool &UseOneConst,
13054 bool Reciprocal) const {
13055 if (Enabled == ReciprocalEstimate::Enabled ||
13056 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
13057 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
13058 DAG, ExtraSteps)) {
13059 SDLoc DL(Operand);
13060 EVT VT = Operand.getValueType();
13061
13062 // Ensure nodes can be recognized by isAssociativeAndCommutative.
13063 SDNodeFlags Flags =
13065
13066 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
13067 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
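      // (Derivation, for reference: Newton's method on f(e) = 1/e^2 - X gives
      //  e' = e * (3 - X * e^2) / 2; the loop below forms e^2 with FMUL,
      //  computes the (3 - X * e^2) / 2 factor with FRSQRTS, and multiplies
      //  it back onto the current estimate.)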
13068 for (int i = ExtraSteps; i > 0; --i) {
13069 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
13070 Flags);
13071 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
13072 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
13073 }
13074 if (!Reciprocal)
13075 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
13076
13077 ExtraSteps = 0;
13078 return Estimate;
13079 }
13080
13081 return SDValue();
13082}
13083
13084SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
13085 SelectionDAG &DAG, int Enabled,
13086 int &ExtraSteps) const {
13087 if (Enabled == ReciprocalEstimate::Enabled)
13088 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
13089 DAG, ExtraSteps)) {
13090 SDLoc DL(Operand);
13091 EVT VT = Operand.getValueType();
13092
13094
13095 // Newton reciprocal iteration: E * (2 - X * E)
13096 // AArch64 reciprocal iteration instruction: (2 - M * N)
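      // (Derivation, for reference: Newton's method on f(e) = 1/e - X gives
      //  e' = e * (2 - X * e); FRECPS computes the (2 - X * e) factor and the
      //  FMUL below multiplies it back onto the current estimate.)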
13097 for (int i = ExtraSteps; i > 0; --i) {
13098 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
13099 Estimate, Flags);
13100 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
13101 }
13102
13103 ExtraSteps = 0;
13104 return Estimate;
13105 }
13106
13107 return SDValue();
13108}
13109
13110//===----------------------------------------------------------------------===//
13111// AArch64 Inline Assembly Support
13112//===----------------------------------------------------------------------===//
13113
13114// Table of Constraints
13115 // TODO: This is the current set of constraints supported by ARM for the
13116 // compiler; not all of them may make sense.
13117//
13118// r - A general register
13119// w - An FP/SIMD register of some size in the range v0-v31
13120// x - An FP/SIMD register of some size in the range v0-v15
13121// I - Constant that can be used with an ADD instruction
13122// J - Constant that can be used with a SUB instruction
13123// K - Constant that can be used with a 32-bit logical instruction
13124// L - Constant that can be used with a 64-bit logical instruction
13125// M - Constant that can be used as a 32-bit MOV immediate
13126// N - Constant that can be used as a 64-bit MOV immediate
13127// Q - A memory reference with base register and no offset
13128// S - A symbolic address
13129// Y - Floating point constant zero
13130// Z - Integer constant zero
13131//
13132// Note that general register operands will be output using their 64-bit x
13133// register name, whatever the size of the variable, unless the asm operand
13134// is prefixed by the %w modifier. Floating-point and SIMD register operands
13135// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
13136// %q modifier.
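//
// Illustrative example of how a few of these constraints appear in user code
// (not part of the original source):
//
//   int res;
//   asm("add %w0, %w1, %2" : "=r"(res) : "r"(base), "I"(4095));
//
// Here 'r' selects a general register (printed as w0/w1 because of the %w
// modifier) and 'I' only accepts an ADD-style immediate in [0, 4095],
// optionally shifted left by 12.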
13137const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
13138 // At this point, we have to lower this constraint to something else, so we
13139 // lower it to an "r" or "w". However, by doing this we will force the result
13140 // to be in a register, while the X constraint is much more permissive.
13141 //
13142 // Although we are correct (we are free to emit anything, without
13143 // constraints), we might break use cases that would expect us to be more
13144 // efficient and emit something else.
13145 if (!Subtarget->hasFPARMv8())
13146 return "r";
13147
13148 if (ConstraintVT.isFloatingPoint())
13149 return "w";
13150
13151 if (ConstraintVT.isVector() &&
13152 (ConstraintVT.getSizeInBits() == 64 ||
13153 ConstraintVT.getSizeInBits() == 128))
13154 return "w";
13155
13156 return "r";
13157}
13158
13159 enum class PredicateConstraint { Uph, Upl, Upa };
13160
13161// Returns a {Reg, RegisterClass} tuple if the constraint is
13162// a specific predicate register.
13163//
13164 // For a constraint like "{pn3}" the default path in
13165// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
13166// suitable register class for this register is "PPRorPNR", after which it
13167// determines that nxv16i1 is an appropriate type for the constraint, which is
13168// not what we want. The code here pre-empts this by matching the register
13169// explicitly.
13170static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
13172 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
13173 (Constraint[1] != 'p' && Constraint[1] != 'z'))
13174 return std::nullopt;
13175
13176 bool IsPredicate = Constraint[1] == 'p';
13177 Constraint = Constraint.substr(2, Constraint.size() - 3);
13178 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
13179 if (IsPredicateAsCount)
13180 Constraint = Constraint.drop_front(1);
13181
13182 unsigned V;
13183 if (Constraint.getAsInteger(10, V) || V > 31)
13184 return std::nullopt;
13185
13186 if (IsPredicateAsCount)
13187 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
13188 if (IsPredicate)
13189 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
13190 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
13191}
13192
13193static std::optional<PredicateConstraint>
13196 .Case("Uph", PredicateConstraint::Uph)
13199 .Default(std::nullopt);
13200}
13201
13202static const TargetRegisterClass *
13204 if (VT != MVT::aarch64svcount &&
13205 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
13206 return nullptr;
13207
13208 switch (Constraint) {
13209 case PredicateConstraint::Uph:
13210 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
13211 : &AArch64::PPR_p8to15RegClass;
13212 case PredicateConstraint::Upl:
13213 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
13214 : &AArch64::PPR_3bRegClass;
13215 case PredicateConstraint::Upa:
13216 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
13217 : &AArch64::PPRRegClass;
13218 }
13219
13220 llvm_unreachable("Missing PredicateConstraint!");
13221}
13222
13223 enum class ReducedGprConstraint { Uci, Ucj };
13224
13225static std::optional<ReducedGprConstraint>
13228 .Case("Uci", ReducedGprConstraint::Uci)
13230 .Default(std::nullopt);
13231}
13232
13233static const TargetRegisterClass *
13235 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
13236 return nullptr;
13237
13238 switch (Constraint) {
13239 case ReducedGprConstraint::Uci:
13240 return &AArch64::MatrixIndexGPR32_8_11RegClass;
13241 case ReducedGprConstraint::Ucj:
13242 return &AArch64::MatrixIndexGPR32_12_15RegClass;
13243 }
13244
13245 llvm_unreachable("Missing ReducedGprConstraint!");
13246}
13247
13248// The set of cc code supported is from
13249// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
13252 .Case("{@cchi}", AArch64CC::HI)
13253 .Case("{@cccs}", AArch64CC::HS)
13254 .Case("{@cclo}", AArch64CC::LO)
13255 .Case("{@ccls}", AArch64CC::LS)
13256 .Case("{@cccc}", AArch64CC::LO)
13257 .Case("{@cceq}", AArch64CC::EQ)
13258 .Case("{@ccgt}", AArch64CC::GT)
13259 .Case("{@ccge}", AArch64CC::GE)
13260 .Case("{@cclt}", AArch64CC::LT)
13261 .Case("{@ccle}", AArch64CC::LE)
13262 .Case("{@cchs}", AArch64CC::HS)
13263 .Case("{@ccne}", AArch64CC::NE)
13264 .Case("{@ccvc}", AArch64CC::VC)
13265 .Case("{@ccpl}", AArch64CC::PL)
13266 .Case("{@ccvs}", AArch64CC::VS)
13267 .Case("{@ccmi}", AArch64CC::MI)
13268 .Default(AArch64CC::Invalid);
13269 return Cond;
13270}
13271
13272/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
13273/// WZR, invert(<cond>)'.
13274 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
13275 SelectionDAG &DAG) {
13276 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
13277 DAG.getConstant(0, DL, MVT::i32),
13278 DAG.getConstant(0, DL, MVT::i32),
13279 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
13280}
13281
13282// Lower @cc flag output via getSETCC.
13283SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
13284 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
13285 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
13286 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
13287 if (Cond == AArch64CC::Invalid)
13288 return SDValue();
13289 // The output variable should be a scalar integer.
13290 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
13291 OpInfo.ConstraintVT.getSizeInBits() < 8)
13292 report_fatal_error("Flag output operand is of invalid type");
13293
13294 // Get NZCV register. Only update chain when copyfrom is glued.
13295 if (Glue.getNode()) {
13296 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
13297 Chain = Glue.getValue(1);
13298 } else
13299 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
13300 // Extract CC code.
13301 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
13302
13303 SDValue Result;
13304
13305 // Truncate or ZERO_EXTEND based on value types.
13306 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
13307 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
13308 else
13309 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
13310
13311 return Result;
13312}
13313
13314/// getConstraintType - Given a constraint letter, return the type of
13315/// constraint it is for this target.
13317AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
13318 if (Constraint.size() == 1) {
13319 switch (Constraint[0]) {
13320 default:
13321 break;
13322 case 'x':
13323 case 'w':
13324 case 'y':
13325 return C_RegisterClass;
13326 // An address with a single base register. Due to the way we
13327 // currently handle addresses it is the same as 'r'.
13328 case 'Q':
13329 return C_Memory;
13330 case 'I':
13331 case 'J':
13332 case 'K':
13333 case 'L':
13334 case 'M':
13335 case 'N':
13336 case 'Y':
13337 case 'Z':
13338 return C_Immediate;
13339 case 'z':
13340 case 'S': // A symbol or label reference with a constant offset
13341 return C_Other;
13342 }
13343 } else if (parsePredicateConstraint(Constraint))
13344 return C_RegisterClass;
13345 else if (parseReducedGprConstraint(Constraint))
13346 return C_RegisterClass;
13347 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
13348 return C_Other;
13349 return TargetLowering::getConstraintType(Constraint);
13350}
13351
13352/// Examine constraint type and operand type and determine a weight value.
13353/// This object must already have been set up with the operand type
13354/// and the current alternative constraint selected.
13355 TargetLowering::ConstraintWeight
13356 AArch64TargetLowering::getSingleConstraintMatchWeight(
13357 AsmOperandInfo &info, const char *constraint) const {
13358 ConstraintWeight weight = CW_Invalid;
13359 Value *CallOperandVal = info.CallOperandVal;
13360 // If we don't have a value, we can't do a match,
13361 // but allow it at the lowest weight.
13362 if (!CallOperandVal)
13363 return CW_Default;
13364 Type *type = CallOperandVal->getType();
13365 // Look at the constraint type.
13366 switch (*constraint) {
13367 default:
13368 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
13369 break;
13370 case 'x':
13371 case 'w':
13372 case 'y':
13373 if (type->isFloatingPointTy() || type->isVectorTy())
13374 weight = CW_Register;
13375 break;
13376 case 'z':
13377 weight = CW_Constant;
13378 break;
13379 case 'U':
13380 if (parsePredicateConstraint(constraint) ||
13381 parseReducedGprConstraint(constraint))
13382 weight = CW_Register;
13383 break;
13384 }
13385 return weight;
13386}
13387
13388std::pair<unsigned, const TargetRegisterClass *>
13389AArch64TargetLowering::getRegForInlineAsmConstraint(
13390 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
13391 if (Constraint.size() == 1) {
13392 switch (Constraint[0]) {
13393 case 'r':
13394 if (VT.isScalableVector())
13395 return std::make_pair(0U, nullptr);
13396 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
13397 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
13398 if (VT.getFixedSizeInBits() == 64)
13399 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
13400 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
13401 case 'w': {
13402 if (!Subtarget->hasFPARMv8())
13403 break;
13404 if (VT.isScalableVector()) {
13405 if (VT.getVectorElementType() != MVT::i1)
13406 return std::make_pair(0U, &AArch64::ZPRRegClass);
13407 return std::make_pair(0U, nullptr);
13408 }
13409 if (VT == MVT::Other)
13410 break;
13411 uint64_t VTSize = VT.getFixedSizeInBits();
13412 if (VTSize == 16)
13413 return std::make_pair(0U, &AArch64::FPR16RegClass);
13414 if (VTSize == 32)
13415 return std::make_pair(0U, &AArch64::FPR32RegClass);
13416 if (VTSize == 64)
13417 return std::make_pair(0U, &AArch64::FPR64RegClass);
13418 if (VTSize == 128)
13419 return std::make_pair(0U, &AArch64::FPR128RegClass);
13420 break;
13421 }
13422 // The instructions that this constraint is designed for can
13423 // only take 128-bit registers so just use that regclass.
13424 case 'x':
13425 if (!Subtarget->hasFPARMv8())
13426 break;
13427 if (VT.isScalableVector())
13428 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
13429 if (VT.getSizeInBits() == 128)
13430 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
13431 break;
13432 case 'y':
13433 if (!Subtarget->hasFPARMv8())
13434 break;
13435 if (VT.isScalableVector())
13436 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
13437 break;
13438 }
13439 } else {
13440 if (const auto P = parseSVERegAsConstraint(Constraint)) {
13441 // SME functions that are not in streaming mode should still observe
13442 // clobbers of Z-registers by clobbering
13443 // the lower 128 bits of those registers.
13444 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
13445 !Subtarget->isSVEorStreamingSVEAvailable())
13446 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
13447 &AArch64::FPR128RegClass);
13448 return *P;
13449 }
13450 if (const auto PC = parsePredicateConstraint(Constraint))
13451 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
13452 return std::make_pair(0U, RegClass);
13453
13454 if (const auto RGC = parseReducedGprConstraint(Constraint))
13455 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
13456 return std::make_pair(0U, RegClass);
13457 }
13458 if (StringRef("{cc}").equals_insensitive(Constraint) ||
13459 parseConstraintCode(Constraint) != AArch64CC::Invalid)
13460 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
13461
13462 if (Constraint == "{za}") {
13463 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
13464 }
13465
13466 if (Constraint == "{zt0}") {
13467 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
13468 }
13469
13470 // Use the default implementation in TargetLowering to convert the register
13471 // constraint into a member of a register class.
13472 std::pair<unsigned, const TargetRegisterClass *> Res;
13473 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
13474
13475 // Not found as a standard register?
13476 if (!Res.second) {
13477 unsigned Size = Constraint.size();
13478 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
13479 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
13480 int RegNo;
13481 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
13482 if (!Failed && RegNo >= 0 && RegNo <= 31) {
13483 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
13484 // By default we'll emit v0-v31 for this unless there's a modifier where
13485 // we'll emit the correct register as well.
13486 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
13487 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
13488 Res.second = &AArch64::FPR64RegClass;
13489 } else {
13490 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13491 Res.second = &AArch64::FPR128RegClass;
13492 }
13493 }
13494 }
13495 }
13496
13497 if (Res.second && !Subtarget->hasFPARMv8() &&
13498 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
13499 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
13500 return std::make_pair(0U, nullptr);
13501
13502 return Res;
13503}
13504
13506 llvm::Type *Ty,
13507 bool AllowUnknown) const {
13508 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
13509 return EVT(MVT::i64x8);
13510
13511 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
13512}
13513
13514/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13515/// vector. If it is invalid, don't add anything to Ops.
13516void AArch64TargetLowering::LowerAsmOperandForConstraint(
13517 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
13518 SelectionDAG &DAG) const {
13519 SDValue Result;
13520
13521 // Currently only support length 1 constraints.
13522 if (Constraint.size() != 1)
13523 return;
13524
13525 char ConstraintLetter = Constraint[0];
13526 switch (ConstraintLetter) {
13527 default:
13528 break;
13529
13530 // This set of constraints deals with valid constants for various instructions.
13531 // Validate and return a target constant for them if we can.
13532 case 'z': {
13533 // 'z' maps to xzr or wzr so it needs an input of 0.
13534 if (!isNullConstant(Op))
13535 return;
13536
13537 if (Op.getValueType() == MVT::i64)
13538 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
13539 else
13540 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
13541 break;
13542 }
13543 case 'S':
13544 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
13545 // supported for PIC while "s" isn't, making "s" less useful. We implement
13546 // "S" but not "s".
13548 break;
13549
13550 case 'I':
13551 case 'J':
13552 case 'K':
13553 case 'L':
13554 case 'M':
13555 case 'N':
13556 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
13557 if (!C)
13558 return;
13559
13560 // Grab the value and do some validation.
13561 uint64_t CVal = C->getZExtValue();
13562 switch (ConstraintLetter) {
13563 // The I constraint applies only to simple ADD or SUB immediate operands:
13564 // i.e. 0 to 4095 with optional shift by 12
13565 // The J constraint applies only to ADD or SUB immediates that would be
13566 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
13567 // instruction [or vice versa], in other words -1 to -4095 with optional
13568 // left shift by 12.
13569 case 'I':
13570 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
13571 break;
13572 return;
13573 case 'J': {
13574 uint64_t NVal = -C->getSExtValue();
13575 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
13576 CVal = C->getSExtValue();
13577 break;
13578 }
13579 return;
13580 }
13581 // The K and L constraints apply *only* to logical immediates, including
13582 // what used to be the MOVI alias for ORR (though the MOVI alias has now
13583 // been removed and MOV should be used). So these constraints have to
13584 // distinguish between bit patterns that are valid 32-bit or 64-bit
13585 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
13586 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
13587 // versa.
13588 case 'K':
13589 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13590 break;
13591 return;
13592 case 'L':
13593 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13594 break;
13595 return;
13596 // The M and N constraints are a superset of K and L respectively, for use
13597 // with the MOV (immediate) alias. As well as the logical immediates they
13598 // also match 32 or 64-bit immediates that can be loaded either using a
13599 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
13600 // (M) or 64-bit 0x1234000000000000 (N) etc.
13601 // As a note some of this code is liberally stolen from the asm parser.
13602 case 'M': {
13603 if (!isUInt<32>(CVal))
13604 return;
13605 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13606 break;
13607 if ((CVal & 0xFFFF) == CVal)
13608 break;
13609 if ((CVal & 0xFFFF0000ULL) == CVal)
13610 break;
13611 uint64_t NCVal = ~(uint32_t)CVal;
13612 if ((NCVal & 0xFFFFULL) == NCVal)
13613 break;
13614 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13615 break;
13616 return;
13617 }
13618 case 'N': {
13619 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13620 break;
13621 if ((CVal & 0xFFFFULL) == CVal)
13622 break;
13623 if ((CVal & 0xFFFF0000ULL) == CVal)
13624 break;
13625 if ((CVal & 0xFFFF00000000ULL) == CVal)
13626 break;
13627 if ((CVal & 0xFFFF000000000000ULL) == CVal)
13628 break;
13629 uint64_t NCVal = ~CVal;
13630 if ((NCVal & 0xFFFFULL) == NCVal)
13631 break;
13632 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13633 break;
13634 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
13635 break;
13636 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
13637 break;
13638 return;
13639 }
13640 default:
13641 return;
13642 }
13643
13644 // All assembler immediates are 64-bit integers.
13645 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
13646 break;
13647 }
13648
13649 if (Result.getNode()) {
13650 Ops.push_back(Result);
13651 return;
13652 }
13653
13654 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13655}
13656
13657//===----------------------------------------------------------------------===//
13658// AArch64 Advanced SIMD Support
13659//===----------------------------------------------------------------------===//
13660
13661/// WidenVector - Given a value in the V64 register class, produce the
13662/// equivalent value in the V128 register class.
13664 EVT VT = V64Reg.getValueType();
13665 unsigned NarrowSize = VT.getVectorNumElements();
13666 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13667 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13668 SDLoc DL(V64Reg);
13669
13670 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
13671 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13672}
13673
13674/// getExtFactor - Determine the adjustment factor for the position when
13675/// generating an "extract from vector registers" instruction.
13676static unsigned getExtFactor(SDValue &V) {
13677 EVT EltType = V.getValueType().getVectorElementType();
13678 return EltType.getSizeInBits() / 8;
13679}
13680
13681// Check if a vector is built from one vector via extracted elements of
13682// another together with an AND mask, ensuring that all elements fit
13683// within range. This can be reconstructed using AND and NEON's TBL1.
13685 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13686 SDLoc DL(Op);
13687 EVT VT = Op.getValueType();
13688 assert(!VT.isScalableVector() &&
13689 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13690
13691 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13692 // directly to TBL1.
13693 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13694 return SDValue();
13695
13696 unsigned NumElts = VT.getVectorNumElements();
13697 assert((NumElts == 8 || NumElts == 16) &&
13698 "Need to have exactly 8 or 16 elements in vector.");
13699
13700 SDValue SourceVec;
13701 SDValue MaskSourceVec;
13702 SmallVector<SDValue, 16> AndMaskConstants;
13703
13704 for (unsigned i = 0; i < NumElts; ++i) {
13705 SDValue V = Op.getOperand(i);
13706 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13707 return SDValue();
13708
13709 SDValue OperandSourceVec = V.getOperand(0);
13710 if (!SourceVec)
13711 SourceVec = OperandSourceVec;
13712 else if (SourceVec != OperandSourceVec)
13713 return SDValue();
13714
13715 // This only looks at shuffles with elements that are
13716 // a) truncated by a constant AND mask extracted from a mask vector, or
13717 // b) extracted directly from a mask vector.
13718 SDValue MaskSource = V.getOperand(1);
13719 if (MaskSource.getOpcode() == ISD::AND) {
13720 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13721 return SDValue();
13722
13723 AndMaskConstants.push_back(MaskSource.getOperand(1));
13724 MaskSource = MaskSource->getOperand(0);
13725 } else if (!AndMaskConstants.empty()) {
13726 // Either all or no operands should have an AND mask.
13727 return SDValue();
13728 }
13729
13730 // An ANY_EXTEND may be inserted between the AND and the source vector
13731 // extraction. We don't care about that, so we can just skip it.
13732 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13733 MaskSource = MaskSource.getOperand(0);
13734
13735 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13736 return SDValue();
13737
13738 SDValue MaskIdx = MaskSource.getOperand(1);
13739 if (!isa<ConstantSDNode>(MaskIdx) ||
13740 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13741 return SDValue();
13742
13743 // We only apply this if all elements come from the same vector with the
13744 // same vector type.
13745 if (!MaskSourceVec) {
13746 MaskSourceVec = MaskSource->getOperand(0);
13747 if (MaskSourceVec.getValueType() != VT)
13748 return SDValue();
13749 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13750 return SDValue();
13751 }
13752 }
13753
13754 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13755 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13756 // insert, we know that the index in the mask must be smaller than the number
13757 // of elements in the source, or we would have an out-of-bounds access.
13758 if (NumElts == 8)
13759 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13760 DAG.getUNDEF(VT));
13761
13762 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13763 if (!AndMaskConstants.empty())
13764 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13765 DAG.getBuildVector(VT, DL, AndMaskConstants));
13766
13767 return DAG.getNode(
13769 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
13770 SourceVec, MaskSourceVec);
13771}
13772
13773// Gather data to see if the operation can be modelled as a
13774// shuffle in combination with VEXTs.
13776 SelectionDAG &DAG) const {
13777 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13778 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13779 SDLoc DL(Op);
13780 EVT VT = Op.getValueType();
13781 assert(!VT.isScalableVector() &&
13782 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13783 unsigned NumElts = VT.getVectorNumElements();
13784
13785 struct ShuffleSourceInfo {
13786 SDValue Vec;
13787 unsigned MinElt;
13788 unsigned MaxElt;
13789
13790 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13791 // be compatible with the shuffle we intend to construct. As a result
13792 // ShuffleVec will be some sliding window into the original Vec.
13793 SDValue ShuffleVec;
13794
13795 // Code should guarantee that element i in Vec starts at element
13796 // "WindowBase + i * WindowScale" in ShuffleVec.
13797 int WindowBase;
13798 int WindowScale;
13799
13800 ShuffleSourceInfo(SDValue Vec)
13801 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13802 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13803
13804 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13805 };
13806
13807 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13808 // node.
13810 for (unsigned i = 0; i < NumElts; ++i) {
13811 SDValue V = Op.getOperand(i);
13812 if (V.isUndef())
13813 continue;
13814 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13815 !isa<ConstantSDNode>(V.getOperand(1)) ||
13816 V.getOperand(0).getValueType().isScalableVector()) {
13817 LLVM_DEBUG(
13818 dbgs() << "Reshuffle failed: "
13819 "a shuffle can only come from building a vector from "
13820 "various elements of other fixed-width vectors, provided "
13821 "their indices are constant\n");
13822 return SDValue();
13823 }
13824
13825 // Add this element source to the list if it's not already there.
13826 SDValue SourceVec = V.getOperand(0);
13827 auto Source = find(Sources, SourceVec);
13828 if (Source == Sources.end())
13829 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13830
13831 // Update the minimum and maximum lane number seen.
13832 unsigned EltNo = V.getConstantOperandVal(1);
13833 Source->MinElt = std::min(Source->MinElt, EltNo);
13834 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13835 }
13836
13837 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13838 // better than moving to/from gpr registers for larger vectors.
13839 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13840 // Construct a mask for the tbl. We may need to adjust the index for types
13841 // larger than i8.
13843 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13844 for (unsigned I = 0; I < NumElts; ++I) {
13845 SDValue V = Op.getOperand(I);
13846 if (V.isUndef()) {
13847 for (unsigned OF = 0; OF < OutputFactor; OF++)
13848 Mask.push_back(-1);
13849 continue;
13850 }
13851 // Set the Mask lanes adjusted for the size of the input and output
13852 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13853 // output element, adjusted in their positions per input and output types.
13854 unsigned Lane = V.getConstantOperandVal(1);
13855 for (unsigned S = 0; S < Sources.size(); S++) {
13856 if (V.getOperand(0) == Sources[S].Vec) {
13857 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13858 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13859 for (unsigned OF = 0; OF < OutputFactor; OF++)
13860 Mask.push_back(InputBase + OF);
13861 break;
13862 }
13863 }
13864 }
13865
13866 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13867 // v16i8, and the TBLMask
13868 SmallVector<SDValue, 16> TBLOperands;
13869 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13870 ? Intrinsic::aarch64_neon_tbl3
13871 : Intrinsic::aarch64_neon_tbl4,
13872 DL, MVT::i32));
13873 for (unsigned i = 0; i < Sources.size(); i++) {
13874 SDValue Src = Sources[i].Vec;
13875 EVT SrcVT = Src.getValueType();
13876 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13877 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13878 "Expected a legally typed vector");
13879 if (SrcVT.is64BitVector())
13880 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13881 DAG.getUNDEF(MVT::v8i8));
13882 TBLOperands.push_back(Src);
13883 }
13884
13886 for (unsigned i = 0; i < Mask.size(); i++)
13887 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13888 assert((Mask.size() == 8 || Mask.size() == 16) &&
13889 "Expected a v8i8 or v16i8 Mask");
13890 TBLOperands.push_back(DAG.getBuildVector(
13891 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13892
13893 SDValue Shuffle =
13895 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
13896 return DAG.getBitcast(VT, Shuffle);
13897 }
13898
13899 if (Sources.size() > 2) {
13900 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13901 << "sensible when at most two source vectors are "
13902 << "involved\n");
13903 return SDValue();
13904 }
13905
13906 // Find out the smallest element size among result and two sources, and use
13907 // it as element size to build the shuffle_vector.
13908 EVT SmallestEltTy = VT.getVectorElementType();
13909 for (auto &Source : Sources) {
13910 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13911 if (SrcEltTy.bitsLT(SmallestEltTy)) {
13912 SmallestEltTy = SrcEltTy;
13913 }
13914 }
13915 unsigned ResMultiplier =
13916 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13917 uint64_t VTSize = VT.getFixedSizeInBits();
13918 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13919 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
13920
13921 // If the source vector is too wide or too narrow, we may nevertheless be able
13922 // to construct a compatible shuffle either by concatenating it with UNDEF or
13923 // extracting a suitable range of elements.
13924 for (auto &Src : Sources) {
13925 EVT SrcVT = Src.ShuffleVec.getValueType();
13926
13927 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13928 if (SrcVTSize == TypeSize::getFixed(VTSize))
13929 continue;
13930
13931 // This stage of the search produces a source with the same element type as
13932 // the original, but with a total width matching the BUILD_VECTOR output.
13933 EVT EltVT = SrcVT.getVectorElementType();
13934 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13935 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
13936
13937 if (SrcVTSize.getFixedValue() < VTSize) {
13938 assert(2 * SrcVTSize == VTSize);
13939 // We can pad out the smaller vector for free, so if it's part of a
13940 // shuffle...
13941 Src.ShuffleVec =
13942 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
13943 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
13944 continue;
13945 }
13946
13947 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
13948 LLVM_DEBUG(
13949 dbgs() << "Reshuffle failed: result vector too small to extract\n");
13950 return SDValue();
13951 }
13952
13953 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
13954 LLVM_DEBUG(
13955 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13956 return SDValue();
13957 }
13958
13959 if (Src.MinElt >= NumSrcElts) {
13960 // The extraction can just take the second half
13961 Src.ShuffleVec =
13962 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13963 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13964 Src.WindowBase = -NumSrcElts;
13965 } else if (Src.MaxElt < NumSrcElts) {
13966 // The extraction can just take the first half
13967 Src.ShuffleVec =
13968 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13969 DAG.getConstant(0, DL, MVT::i64));
13970 } else {
13971 // An actual VEXT is needed
13972 SDValue VEXTSrc1 =
13973 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13974 DAG.getConstant(0, DL, MVT::i64));
13975 SDValue VEXTSrc2 =
13976 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13977 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13978 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
13979
13980 if (!SrcVT.is64BitVector()) {
13981 LLVM_DEBUG(
13982 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13983 "for SVE vectors.");
13984 return SDValue();
13985 }
13986
13987 Src.ShuffleVec =
13988 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
13989 DAG.getConstant(Imm, DL, MVT::i32));
13990 Src.WindowBase = -Src.MinElt;
13991 }
13992 }
13993
13994 // Another possible incompatibility occurs from the vector element types. We
13995 // can fix this by bitcasting the source vectors to the same type we intend
13996 // for the shuffle.
13997 for (auto &Src : Sources) {
13998 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13999 if (SrcEltTy == SmallestEltTy)
14000 continue;
14001 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
14002 if (DAG.getDataLayout().isBigEndian()) {
14003 Src.ShuffleVec =
14004 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
14005 } else {
14006 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
14007 }
14008 Src.WindowScale =
14009 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
14010 Src.WindowBase *= Src.WindowScale;
14011 }
14012
14013 // Final check before we try to actually produce a shuffle.
14014 LLVM_DEBUG({
14015 for (auto Src : Sources)
14016 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
14017 });
14018
14019 // The stars all align, our next step is to produce the mask for the shuffle.
14020 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
14021 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
14022 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
14023 SDValue Entry = Op.getOperand(i);
14024 if (Entry.isUndef())
14025 continue;
14026
14027 auto Src = find(Sources, Entry.getOperand(0));
14028 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
14029
14030 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
14031 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
14032 // segment.
14033 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
14034 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
14035 VT.getScalarSizeInBits());
14036 int LanesDefined = BitsDefined / BitsPerShuffleLane;
14037
14038 // This source is expected to fill ResMultiplier lanes of the final shuffle,
14039 // starting at the appropriate offset.
14040 int *LaneMask = &Mask[i * ResMultiplier];
14041
14042 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
14043 ExtractBase += NumElts * (Src - Sources.begin());
14044 for (int j = 0; j < LanesDefined; ++j)
14045 LaneMask[j] = ExtractBase + j;
14046 }
14047
14048 // Final check before we try to produce nonsense...
14049 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
14050 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
14051 return SDValue();
14052 }
14053
14054 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
14055 for (unsigned i = 0; i < Sources.size(); ++i)
14056 ShuffleOps[i] = Sources[i].ShuffleVec;
14057
14058 SDValue Shuffle =
14059 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
14060 SDValue V;
14061 if (DAG.getDataLayout().isBigEndian()) {
14062 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
14063 } else {
14064 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
14065 }
14066
14067 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
14068 dbgs() << "Reshuffle, creating node: "; V.dump(););
14069
14070 return V;
14071}
14072
14073 // Check if an EXT instruction can handle the shuffle mask when the
14074// vector sources of the shuffle are the same.
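// Illustrative example: for v8i8 the mask <3, 4, 5, 6, 7, 0, 1, 2> satisfies
// this check with Imm = 3: the indices increase by one per lane and wrap from
// 7 back to 0, which is what EXT produces when both sources are the same
// register and the start index is 3.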
14075static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
14076 unsigned NumElts = VT.getVectorNumElements();
14077
14078 // Assume that the first shuffle index is not UNDEF. Fail if it is.
14079 if (M[0] < 0)
14080 return false;
14081
14082 Imm = M[0];
14083
14084 // If this is a VEXT shuffle, the immediate value is the index of the first
14085 // element. The other shuffle indices must be the successive elements after
14086 // the first one.
14087 unsigned ExpectedElt = Imm;
14088 for (unsigned i = 1; i < NumElts; ++i) {
14089 // Increment the expected index. If it wraps around, just follow it
14090 // back to index zero and keep going.
14091 ++ExpectedElt;
14092 if (ExpectedElt == NumElts)
14093 ExpectedElt = 0;
14094
14095 if (M[i] < 0)
14096 continue; // ignore UNDEF indices
14097 if (ExpectedElt != static_cast<unsigned>(M[i]))
14098 return false;
14099 }
14100
14101 return true;
14102}
14103
14104// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
14105// v4i32s. This is really a truncate, which we can construct out of (legal)
14106// concats and truncate nodes.
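// Illustrative shape of the result (not from the original source): for four
// v4i32 inputs a, b, c, d this builds
//   concat(trunc(a), trunc(b)) and concat(trunc(c), trunc(d))  -> two v8i16,
// then truncates each to v8i8 and concatenates them into the final v16i8.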
14108 if (V.getValueType() != MVT::v16i8)
14109 return SDValue();
14110 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
14111
14112 for (unsigned X = 0; X < 4; X++) {
14113 // Check the first item in each group is an extract from lane 0 of a v4i32
14114 // or v4i16.
14115 SDValue BaseExt = V.getOperand(X * 4);
14116 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14117 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
14118 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
14119 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
14120 BaseExt.getConstantOperandVal(1) != 0)
14121 return SDValue();
14122 SDValue Base = BaseExt.getOperand(0);
14123 // And check the other items are extracts from the same vector.
14124 for (unsigned Y = 1; Y < 4; Y++) {
14125 SDValue Ext = V.getOperand(X * 4 + Y);
14126 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14127 Ext.getOperand(0) != Base ||
14128 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
14129 Ext.getConstantOperandVal(1) != Y)
14130 return SDValue();
14131 }
14132 }
14133
14134 // Turn the buildvector into a series of truncates and concats, which will
14135 // become uzp1 instructions. Any v4i32s we found get truncated to v4i16 and
14136 // concatenated in pairs to produce two v8i16s, which are in turn truncated
14137 // and concatenated together.
14138 SDLoc DL(V);
14139 SDValue Trunc[4] = {
14140 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
14141 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
14142 for (SDValue &V : Trunc)
14143 if (V.getValueType() == MVT::v4i32)
14144 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
14145 SDValue Concat0 =
14146 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
14147 SDValue Concat1 =
14148 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
14149 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
14150 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
14151 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
14152}
14153
14154/// Check if a vector shuffle corresponds to a DUP instruction with a larger
14155/// element width than the vector lane type. If that is the case the function
14156/// returns true and writes the value of the DUP instruction lane operand into
14157/// DupLaneOp.
14158static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
14159 unsigned &DupLaneOp) {
14160 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
14161 "Only possible block sizes for wide DUP are: 16, 32, 64");
14162
14163 if (BlockSize <= VT.getScalarSizeInBits())
14164 return false;
14165 if (BlockSize % VT.getScalarSizeInBits() != 0)
14166 return false;
14167 if (VT.getSizeInBits() % BlockSize != 0)
14168 return false;
14169
14170 size_t SingleVecNumElements = VT.getVectorNumElements();
14171 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
14172 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
14173
14174 // We are looking for masks like
14175 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
14176 // might be replaced by 'undefined'. BlockElts will eventually contain
14177 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
14178 // for the above examples)
14179 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
14180 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
14181 for (size_t I = 0; I < NumEltsPerBlock; I++) {
14182 int Elt = M[BlockIndex * NumEltsPerBlock + I];
14183 if (Elt < 0)
14184 continue;
14185 // For now we don't support shuffles that use the second operand
14186 if ((unsigned)Elt >= SingleVecNumElements)
14187 return false;
14188 if (BlockElts[I] < 0)
14189 BlockElts[I] = Elt;
14190 else if (BlockElts[I] != Elt)
14191 return false;
14192 }
14193
14194 // We found a candidate block (possibly with some undefs). It must be a
14195 // sequence of consecutive integers starting with a value divisible by
14196 // NumEltsPerBlock with some values possibly replaced by undef-s.
14197
14198 // Find first non-undef element
14199 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
14200 assert(FirstRealEltIter != BlockElts.end() &&
14201 "Shuffle with all-undefs must have been caught by previous cases, "
14202 "e.g. isSplat()");
14203 if (FirstRealEltIter == BlockElts.end()) {
14204 DupLaneOp = 0;
14205 return true;
14206 }
14207
14208 // Index of FirstRealElt in BlockElts
14209 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
14210
14211 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
14212 return false;
14213 // BlockElts[0] must have the following value if it isn't undef:
14214 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
14215
14216 // Check the first element
14217 if (Elt0 % NumEltsPerBlock != 0)
14218 return false;
14219 // Check that the sequence indeed consists of consecutive integers (modulo
14220 // undefs)
14221 for (size_t I = 0; I < NumEltsPerBlock; I++)
14222 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
14223 return false;
14224
14225 DupLaneOp = Elt0 / NumEltsPerBlock;
14226 return true;
14227}
14228
14229// check if an EXT instruction can handle the shuffle mask when the
14230// vector sources of the shuffle are different.
14231static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
14232 unsigned &Imm) {
14233 // Look for the first non-undef element.
14234 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
14235
14236 // Benefit from APInt to handle overflow when calculating expected element.
14237 unsigned NumElts = VT.getVectorNumElements();
14238 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
14239 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
14240 /*implicitTrunc=*/true);
14241 // The following shuffle indices must be the successive elements after the
14242 // first real element.
14243 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
14244 return Elt != ExpectedElt++ && Elt >= 0;
14245 });
14246 if (FoundWrongElt)
14247 return false;
14248
14249 // The index of an EXT is the first element if it is not UNDEF.
14250 // Watch out for the beginning UNDEFs. The EXT index should be the expected
14251 // value of the first element. E.g.
14252 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
14253 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
14254 // ExpectedElt is the last mask index plus 1.
14255 Imm = ExpectedElt.getZExtValue();
14256
14257 // There are two different cases that require reversing the input vectors.
14258 // For example, for vector <4 x i32> we have the following cases,
14259 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
14260 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
14261 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
14262 // to reverse two input vectors.
14263 if (Imm < NumElts)
14264 ReverseEXT = true;
14265 else
14266 Imm -= NumElts;
14267
14268 return true;
14269}
14270
14271/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
14272/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14273/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
14274static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14275 unsigned NumElts = VT.getVectorNumElements();
14276 if (NumElts % 2 != 0)
14277 return false;
14278 WhichResult = (M[0] == 0 ? 0 : 1);
14279 unsigned Idx = WhichResult * NumElts / 2;
14280 for (unsigned i = 0; i != NumElts; i += 2) {
14281 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
14282 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
14283 return false;
14284 Idx += 1;
14285 }
14286
14287 return true;
14288}
14289
14290/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
14291/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14292/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
14293static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14294 unsigned Half = VT.getVectorNumElements() / 2;
14295 WhichResult = (M[0] == 0 ? 0 : 1);
14296 for (unsigned j = 0; j != 2; ++j) {
14297 unsigned Idx = WhichResult;
14298 for (unsigned i = 0; i != Half; ++i) {
14299 int MIdx = M[i + j * Half];
14300 if (MIdx >= 0 && (unsigned)MIdx != Idx)
14301 return false;
14302 Idx += 2;
14303 }
14304 }
14305
14306 return true;
14307}
14308
14309/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
14310/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14311/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
14312static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14313 unsigned NumElts = VT.getVectorNumElements();
14314 if (NumElts % 2 != 0)
14315 return false;
14316 WhichResult = (M[0] == 0 ? 0 : 1);
14317 for (unsigned i = 0; i < NumElts; i += 2) {
14318 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
14319 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
14320 return false;
14321 }
14322 return true;
14323}
14324
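// Match shuffle masks that are the identity for one of the two inputs except
// for a single "anomalous" lane, which can then be implemented with one INS
// (insert element). DstIsLeft reports whether the left-hand input supplies the
// identity lanes; Anomaly is the index of the mismatching lane.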
14325static bool isINSMask(ArrayRef<int> M, int NumInputElements,
14326 bool &DstIsLeft, int &Anomaly) {
14327 if (M.size() != static_cast<size_t>(NumInputElements))
14328 return false;
14329
14330 int NumLHSMatch = 0, NumRHSMatch = 0;
14331 int LastLHSMismatch = -1, LastRHSMismatch = -1;
14332
14333 for (int i = 0; i < NumInputElements; ++i) {
14334 if (M[i] == -1) {
14335 ++NumLHSMatch;
14336 ++NumRHSMatch;
14337 continue;
14338 }
14339
14340 if (M[i] == i)
14341 ++NumLHSMatch;
14342 else
14343 LastLHSMismatch = i;
14344
14345 if (M[i] == i + NumInputElements)
14346 ++NumRHSMatch;
14347 else
14348 LastRHSMismatch = i;
14349 }
14350
14351 if (NumLHSMatch == NumInputElements - 1) {
14352 DstIsLeft = true;
14353 Anomaly = LastLHSMismatch;
14354 return true;
14355 } else if (NumRHSMatch == NumInputElements - 1) {
14356 DstIsLeft = false;
14357 Anomaly = LastRHSMismatch;
14358 return true;
14359 }
14360
14361 return false;
14362}
14363
14364static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
14365 if (VT.getSizeInBits() != 128)
14366 return false;
14367
14368 unsigned NumElts = VT.getVectorNumElements();
14369
14370 for (int I = 0, E = NumElts / 2; I != E; I++) {
14371 if (Mask[I] != I)
14372 return false;
14373 }
14374
14375 int Offset = NumElts / 2;
14376 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
14377 if (Mask[I] != I + SplitLHS * Offset)
14378 return false;
14379 }
14380
14381 return true;
14382}
14383
14384static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
14385 SDLoc DL(Op);
14386 EVT VT = Op.getValueType();
14387 SDValue V0 = Op.getOperand(0);
14388 SDValue V1 = Op.getOperand(1);
14389 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14390
14391 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
14392 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
14393 return SDValue();
14394
14395 bool SplitV0 = V0.getValueSizeInBits() == 128;
14396
14397 if (!isConcatMask(Mask, VT, SplitV0))
14398 return SDValue();
14399
14400 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14401 if (SplitV0) {
14402 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14403 DAG.getConstant(0, DL, MVT::i64));
14404 }
14405 if (V1.getValueSizeInBits() == 128) {
14406 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14407 DAG.getConstant(0, DL, MVT::i64));
14408 }
14409 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14410}
14411
14412/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14413/// the specified operations to build the shuffle. ID is the perfect-shuffle
14414/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
14415/// table entry and LHS/RHS are the immediate inputs for this stage of the
14416/// shuffle.
14417static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
14418 unsigned PFEntry, SDValue LHS,
14419 SDValue RHS, SelectionDAG &DAG,
14420 const SDLoc &DL) {
14421 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14422 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14423 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
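// Each 13-bit ID packs four lane indices as base-9 digits: digits 0-7 pick a
// lane of the two original inputs and 8 means undef. For example, the identity
// mask <0,1,2,3> encodes as ((0*9+1)*9+2)*9+3 == 102 and <4,5,6,7> as
// ((4*9+5)*9+6)*9+7 == 3382, which is exactly what the OP_COPY check below
// tests against.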
14424
14425 enum {
14426 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14427 OP_VREV,
14428 OP_VDUP0,
14429 OP_VDUP1,
14430 OP_VDUP2,
14431 OP_VDUP3,
14432 OP_VEXT1,
14433 OP_VEXT2,
14434 OP_VEXT3,
14435 OP_VUZPL, // VUZP, left result
14436 OP_VUZPR, // VUZP, right result
14437 OP_VZIPL, // VZIP, left result
14438 OP_VZIPR, // VZIP, right result
14439 OP_VTRNL, // VTRN, left result
14440 OP_VTRNR, // VTRN, right result
14441 OP_MOVLANE // Move lane. RHSID is the lane to move into
14442 };
14443
14444 if (OpNum == OP_COPY) {
14445 if (LHSID == (1 * 9 + 2) * 9 + 3)
14446 return LHS;
14447 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14448 return RHS;
14449 }
14450
14451 if (OpNum == OP_MOVLANE) {
14452 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14453 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14454 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14455 Elt = 3 - Elt;
14456 while (Elt > 0) {
14457 ID /= 9;
14458 Elt--;
14459 }
14460 return (ID % 9 == 8) ? -1 : ID % 9;
14461 };
14462
14463 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14464 // get the lane to move from the PFID, which is always from the
14465 // original vectors (V1 or V2).
14466 SDValue OpLHS = GeneratePerfectShuffle(
14467 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14468 EVT VT = OpLHS.getValueType();
14469 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14470 unsigned ExtLane = 0;
14471 SDValue Input;
14472
14473 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
14474 // convert into a higher type.
14475 if (RHSID & 0x4) {
14476 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14477 if (MaskElt == -1)
14478 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14479 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14480 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14481 Input = MaskElt < 2 ? V1 : V2;
14482 if (VT.getScalarSizeInBits() == 16) {
14483 Input = DAG.getBitcast(MVT::v2f32, Input);
14484 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14485 } else {
14486 assert(VT.getScalarSizeInBits() == 32 &&
14487 "Expected 16 or 32 bit shuffle elements");
14488 Input = DAG.getBitcast(MVT::v2f64, Input);
14489 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14490 }
14491 } else {
14492 int MaskElt = getPFIDLane(ID, RHSID);
14493 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14494 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14495 Input = MaskElt < 4 ? V1 : V2;
14496 // Be careful about creating illegal types. Use f16 instead of i16.
14497 if (VT == MVT::v4i16) {
14498 Input = DAG.getBitcast(MVT::v4f16, Input);
14499 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14500 }
14501 }
14502 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
14503 Input.getValueType().getVectorElementType(),
14504 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14505 SDValue Ins =
14506 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14507 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14508 return DAG.getBitcast(VT, Ins);
14509 }
14510
14511 SDValue OpLHS, OpRHS;
14512 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14513 RHS, DAG, DL);
14514 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14515 RHS, DAG, DL);
14516 EVT VT = OpLHS.getValueType();
14517
14518 switch (OpNum) {
14519 default:
14520 llvm_unreachable("Unknown shuffle opcode!");
14521 case OP_VREV:
14522 // VREV divides the vector in half and swaps within the half.
14523 if (VT.getVectorElementType() == MVT::i32 ||
14524 VT.getVectorElementType() == MVT::f32)
14525 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14526 // vrev <4 x i16> -> REV32
14527 if (VT.getVectorElementType() == MVT::i16 ||
14528 VT.getVectorElementType() == MVT::f16 ||
14529 VT.getVectorElementType() == MVT::bf16)
14530 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14531 // vrev <4 x i8> -> REV16
14532 assert(VT.getVectorElementType() == MVT::i8);
14533 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
14534 case OP_VDUP0:
14535 case OP_VDUP1:
14536 case OP_VDUP2:
14537 case OP_VDUP3: {
14538 EVT EltTy = VT.getVectorElementType();
14539 unsigned Opcode;
14540 if (EltTy == MVT::i8)
14541 Opcode = AArch64ISD::DUPLANE8;
14542 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14543 Opcode = AArch64ISD::DUPLANE16;
14544 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14545 Opcode = AArch64ISD::DUPLANE32;
14546 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14547 Opcode = AArch64ISD::DUPLANE64;
14548 else
14549 llvm_unreachable("Invalid vector element type?");
14550
14551 if (VT.getSizeInBits() == 64)
14552 OpLHS = WidenVector(OpLHS, DAG);
14553 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14554 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14555 }
14556 case OP_VEXT1:
14557 case OP_VEXT2:
14558 case OP_VEXT3: {
14559 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14560 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14561 DAG.getConstant(Imm, DL, MVT::i32));
14562 }
14563 case OP_VUZPL:
14564 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14565 case OP_VUZPR:
14566 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14567 case OP_VZIPL:
14568 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14569 case OP_VZIPR:
14570 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14571 case OP_VTRNL:
14572 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14573 case OP_VTRNR:
14574 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14575 }
14576}
14577
14578static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
14579 SelectionDAG &DAG) {
14580 // Check to see if we can use the TBL instruction.
14581 SDValue V1 = Op.getOperand(0);
14582 SDValue V2 = Op.getOperand(1);
14583 SDLoc DL(Op);
14584
14585 EVT EltVT = Op.getValueType().getVectorElementType();
14586 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14587
14588 bool Swap = false;
14589 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14590 std::swap(V1, V2);
14591 Swap = true;
14592 }
14593
14594 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14595 // out of range values with 0s. We do need to make sure that any out-of-range
14596 // values are really out-of-range for a v16i8 vector.
14597 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
14598 MVT IndexVT = MVT::v8i8;
14599 unsigned IndexLen = 8;
14600 if (Op.getValueSizeInBits() == 128) {
14601 IndexVT = MVT::v16i8;
14602 IndexLen = 16;
14603 }
14604
14605 SmallVector<SDValue, 8> TBLMask;
14606 for (int Val : ShuffleMask) {
14607 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14608 unsigned Offset = Byte + Val * BytesPerElt;
14609 if (Swap)
14610 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
14611 if (IsUndefOrZero && Offset >= IndexLen)
14612 Offset = 255;
14613 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14614 }
14615 }
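// For example, shuffling two 128-bit v8i16 vectors gives BytesPerElt == 2 and
// IndexLen == 16, so a mask value of 9 (lane 1 of V2) expands to byte indices
// 18 and 19 of the concatenated table. When V2 is undef or zero, those
// indices beyond a single 16-byte table are forced to 255 so that TBL1
// returns 0 for them.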
14616
14617 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14618 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14619
14620 SDValue Shuffle;
14621 if (IsUndefOrZero) {
14622 if (IndexLen == 8)
14623 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14624 Shuffle = DAG.getNode(
14625 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14626 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14627 V1Cst,
14628 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14629 } else {
14630 if (IndexLen == 8) {
14631 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14632 Shuffle = DAG.getNode(
14633 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14634 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14635 V1Cst,
14636 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14637 } else {
14638 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14639 // cannot currently represent the register constraints on the input
14640 // table registers.
14641 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14642 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14643 // IndexLen));
14644 Shuffle = DAG.getNode(
14645 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14646 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32),
14647 V1Cst, V2Cst,
14648 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14649 }
14650 }
14651 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14652}
14653
14654static unsigned getDUPLANEOp(EVT EltType) {
14655 if (EltType == MVT::i8)
14656 return AArch64ISD::DUPLANE8;
14657 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14658 return AArch64ISD::DUPLANE16;
14659 if (EltType == MVT::i32 || EltType == MVT::f32)
14660 return AArch64ISD::DUPLANE32;
14661 if (EltType == MVT::i64 || EltType == MVT::f64)
14662 return AArch64ISD::DUPLANE64;
14663
14664 llvm_unreachable("Invalid vector element type?");
14665}
14666
14667static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14668 unsigned Opcode, SelectionDAG &DAG) {
14669 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14670 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14671 // Match: dup (bitcast (extract_subv X, C)), LaneC
14672 if (BitCast.getOpcode() != ISD::BITCAST ||
14673 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
14674 return false;
14675
14676 // The extract index must align in the destination type. That may not
14677 // happen if the bitcast is from a narrow to a wide type.
14678 SDValue Extract = BitCast.getOperand(0);
14679 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14680 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14681 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14682 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14683 if (ExtIdxInBits % CastedEltBitWidth != 0)
14684 return false;
14685
14686 // Can't handle cases where vector size is not 128-bit
14687 if (!Extract.getOperand(0).getValueType().is128BitVector())
14688 return false;
14689
14690 // Update the lane value by offsetting with the scaled extract index.
14691 LaneC += ExtIdxInBits / CastedEltBitWidth;
14692
14693 // Determine the casted vector type of the wide vector input.
14694 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14695 // Examples:
14696 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14697 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14698 unsigned SrcVecNumElts =
14699 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
14700 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
14701 SrcVecNumElts);
14702 return true;
14703 };
14704 MVT CastVT;
14705 if (getScaledOffsetDup(V, Lane, CastVT)) {
14706 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14707 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14708 V.getOperand(0).getValueType().is128BitVector()) {
14709 // The lane is incremented by the index of the extract.
14710 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14711 Lane += V.getConstantOperandVal(1);
14712 V = V.getOperand(0);
14713 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14714 // The lane is decremented if we are splatting from the 2nd operand.
14715 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14716 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14717 Lane -= Idx * VT.getVectorNumElements() / 2;
14718 V = WidenVector(V.getOperand(Idx), DAG);
14719 } else if (VT.getSizeInBits() == 64) {
14720 // Widen the operand to 128-bit register with undef.
14721 V = WidenVector(V, DAG);
14722 }
14723 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14724}
14725
14726// Try to widen element type to get a new mask value for a better permutation
14727// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14728// UZP1/2, TRN1/2, REV, INS, etc.
14729// For example:
14730// shufflevector <4 x i32> %a, <4 x i32> %b,
14731// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14732// is equivalent to:
14733// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14734// Finally, we can get:
14735// mov v0.d[0], v1.d[1]
14736static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
14737 SDLoc DL(Op);
14738 EVT VT = Op.getValueType();
14739 EVT ScalarVT = VT.getVectorElementType();
14740 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14741 SDValue V0 = Op.getOperand(0);
14742 SDValue V1 = Op.getOperand(1);
14743 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14744
14745 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
14746 // We need to make sure the wider element type is legal. Thus, ElementSize
14747 // should be not larger than 32 bits, and i1 type should also be excluded.
14748 if (ElementSize > 32 || ElementSize == 1)
14749 return SDValue();
14750
14751 SmallVector<int, 8> NewMask;
14752 if (widenShuffleMaskElts(Mask, NewMask)) {
14753 MVT NewEltVT = VT.isFloatingPoint()
14754 ? MVT::getFloatingPointVT(ElementSize * 2)
14755 : MVT::getIntegerVT(ElementSize * 2);
14756 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14757 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14758 V0 = DAG.getBitcast(NewVT, V0);
14759 V1 = DAG.getBitcast(NewVT, V1);
14760 return DAG.getBitcast(VT,
14761 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14762 }
14763 }
14764
14765 return SDValue();
14766}
14767
14768// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
14769static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
14770 ArrayRef<int> ShuffleMask,
14771 SelectionDAG &DAG) {
14772 SDValue Tbl1 = Op->getOperand(0);
14773 SDValue Tbl2 = Op->getOperand(1);
14774 SDLoc DL(Op);
14775 SDValue Tbl2ID =
14776 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14777
14778 EVT VT = Op.getValueType();
14779 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14780 Tbl1.getOperand(0) != Tbl2ID ||
14781 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14782 Tbl2.getOperand(0) != Tbl2ID)
14783 return SDValue();
14784
14785 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14786 return SDValue();
14787
14788 SDValue Mask1 = Tbl1.getOperand(3);
14789 SDValue Mask2 = Tbl2.getOperand(3);
14790 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14791 Mask2.getOpcode() != ISD::BUILD_VECTOR)
14792 return SDValue();
14793
14794 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
14795 for (unsigned I = 0; I < 16; I++) {
14796 if (ShuffleMask[I] < 16)
14797 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
14798 else {
14799 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
14800 if (!C)
14801 return SDValue();
14802 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
14803 }
14804 }
14805
14806 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
14807 SDValue ID =
14808 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
14809
14810 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
14811 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
14812 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
14813}
14814
14815SDValue
14816AArch64TargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
14817 SelectionDAG &DAG) const {
14818 SDLoc DL(Op);
14819 EVT VT = Op.getValueType();
14820 assert(VT.isScalableVector() && "Unexpected result type!");
14821
14822 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;
14823 unsigned UnpackOpcode = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14824
14825 // Repeatedly unpack Val until the result is of the desired type.
14826 SDValue Val = Op.getOperand(0);
14827 switch (Val.getSimpleValueType().SimpleTy) {
14828 default:
14829 return SDValue();
14830 case MVT::nxv16i8:
14831 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv8i16, Val);
14832 if (VT == MVT::nxv8i16)
14833 break;
14834 [[fallthrough]];
14835 case MVT::nxv8i16:
14836 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv4i32, Val);
14837 if (VT == MVT::nxv4i32)
14838 break;
14839 [[fallthrough]];
14840 case MVT::nxv4i32:
14841 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv2i64, Val);
14842 assert(VT == MVT::nxv2i64 && "Unexpected result type!");
14843 break;
14844 }
14845
14846 return Val;
14847}
14848
14849// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14850// but we don't have an appropriate instruction,
14851// so custom-lower it as ZIP1-with-zeros.
14852SDValue
14853AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14854 SelectionDAG &DAG) const {
14855 SDLoc DL(Op);
14856 EVT VT = Op.getValueType();
14857
14858 if (VT.isScalableVector())
14859 return LowerEXTEND_VECTOR_INREG(Op, DAG);
14860
14861 SDValue SrcOp = Op.getOperand(0);
14862 EVT SrcVT = SrcOp.getValueType();
14863 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14864 "Unexpected extension factor.");
14865 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14866 // FIXME: support multi-step zipping?
14867 if (Scale != 2)
14868 return SDValue();
14869 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
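// For example, zero-extending the low half of a v16i8 to v8i16 becomes
// ZIP1(src, zeros), i.e. the byte sequence <s0,0,s1,0,...,s7,0>, which on a
// little-endian lane layout bitcasts to the zero-extended v8i16 result.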
14870 return DAG.getBitcast(VT,
14871 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14872}
14873
14874SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
14875 SelectionDAG &DAG) const {
14876 SDLoc DL(Op);
14877 EVT VT = Op.getValueType();
14878
14879 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
14880
14881 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14882 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
14883
14884 // Convert shuffles that are directly supported on NEON to target-specific
14885 // DAG nodes, instead of keeping them as shuffles and matching them again
14886 // during code selection. This is more efficient and avoids the possibility
14887 // of inconsistencies between legalization and selection.
14888 ArrayRef<int> ShuffleMask = SVN->getMask();
14889
14890 SDValue V1 = Op.getOperand(0);
14891 SDValue V2 = Op.getOperand(1);
14892
14893 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
14894 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
14895 "Unexpected VECTOR_SHUFFLE mask size!");
14896
14897 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
14898 return Res;
14899
14900 if (SVN->isSplat()) {
14901 int Lane = SVN->getSplatIndex();
14902 // If this is undef splat, generate it via "just" vdup, if possible.
14903 if (Lane == -1)
14904 Lane = 0;
14905
14906 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
14907 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
14908 V1.getOperand(0));
14909 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
14910 // constant. If so, we can just reference the lane's definition directly.
14911 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
14912 !isa<ConstantSDNode>(V1.getOperand(Lane)))
14913 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
14914
14915 // Otherwise, duplicate from the lane of the input vector.
14916 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
14917 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
14918 }
14919
14920 // Check if the mask matches a DUP for a wider element
14921 for (unsigned LaneSize : {64U, 32U, 16U}) {
14922 unsigned Lane = 0;
14923 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
14924 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14925 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14926 : AArch64ISD::DUPLANE16;
14927 // Cast V1 to an integer vector with required lane size
14928 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
14929 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14930 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
14931 V1 = DAG.getBitcast(NewVecTy, V1);
14932 // Construct the DUP instruction
14933 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
14934 // Cast back to the original type
14935 return DAG.getBitcast(VT, V1);
14936 }
14937 }
14938
14939 unsigned NumElts = VT.getVectorNumElements();
14940 unsigned EltSize = VT.getScalarSizeInBits();
14941 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
14942 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
14943 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
14944 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
14945 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
14946 return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);
14947
14948 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
14949 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
14950 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
14951 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
14952 DAG.getConstant(8, DL, MVT::i32));
14953 }
14954
14955 bool ReverseEXT = false;
14956 unsigned Imm;
14957 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
14958 if (ReverseEXT)
14959 std::swap(V1, V2);
14960 Imm *= getExtFactor(V1);
14961 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
14962 DAG.getConstant(Imm, DL, MVT::i32));
14963 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
14964 Imm *= getExtFactor(V1);
14965 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
14966 DAG.getConstant(Imm, DL, MVT::i32));
14967 }
14968
14969 unsigned WhichResult;
14970 unsigned OperandOrder;
14971 if (isZIPMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
14972 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14973 return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
14974 OperandOrder == 0 ? V2 : V1);
14975 }
14976 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
14977 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14978 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14979 }
14980 if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
14981 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14982 return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
14983 OperandOrder == 0 ? V2 : V1);
14984 }
14985
14986 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14987 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14988 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14989 }
14990 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14991 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14992 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14993 }
14994 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14995 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14996 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14997 }
14998
14999 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
15000 return Concat;
15001
15002 bool DstIsLeft;
15003 int Anomaly;
15004 int NumInputElements = V1.getValueType().getVectorNumElements();
15005 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
15006 SDValue DstVec = DstIsLeft ? V1 : V2;
15007 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
15008
15009 SDValue SrcVec = V1;
15010 int SrcLane = ShuffleMask[Anomaly];
15011 if (SrcLane >= NumInputElements) {
15012 SrcVec = V2;
15013 SrcLane -= NumElts;
15014 }
15015 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
15016
15017 EVT ScalarVT = VT.getVectorElementType();
15018
15019 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
15020 ScalarVT = MVT::i32;
15021
15022 return DAG.getNode(
15023 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
15024 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
15025 DstLaneV);
15026 }
15027
15028 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
15029 return NewSD;
15030
15031 // If the shuffle is not directly supported and it has 4 elements, use
15032 // the PerfectShuffle-generated table to synthesize it from other shuffles.
15033 if (NumElts == 4) {
15034 unsigned PFIndexes[4];
15035 for (unsigned i = 0; i != 4; ++i) {
15036 if (ShuffleMask[i] < 0)
15037 PFIndexes[i] = 8;
15038 else
15039 PFIndexes[i] = ShuffleMask[i];
15040 }
15041
15042 // Compute the index in the perfect shuffle table.
15043 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
15044 PFIndexes[2] * 9 + PFIndexes[3];
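// For example, the mask <0,4,1,5> (a zip) maps to index
// 0*729 + 4*81 + 1*9 + 5 == 338, with undef lanes encoded as 8.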
15045 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
15046 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
15047 DL);
15048 }
15049
15050 // Check for a "select shuffle", generating a BSL to pick between lanes in
15051 // V1/V2.
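// For example, a v4i32 mask of <0,5,2,7> takes lanes 0 and 2 from V1 and
// lanes 1 and 3 from V2, so the BSL mask constant becomes
// <0xffffffff, 0, 0xffffffff, 0>.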
15052 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
15053 assert(VT.getScalarSizeInBits() <= 32 &&
15054 "Expected larger vector element sizes to be handled already");
15055 SmallVector<SDValue> MaskElts;
15056 for (int M : ShuffleMask)
15057 MaskElts.push_back(DAG.getConstant(
15058 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
15059 EVT IVT = VT.changeVectorElementTypeToInteger();
15060 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
15061 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
15062 DAG.getBitcast(IVT, V1),
15063 DAG.getBitcast(IVT, V2)));
15064 }
15065
15066 // Fall back to generating a TBL
15067 return GenerateTBL(Op, ShuffleMask, DAG);
15068}
15069
15070SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
15071 SelectionDAG &DAG) const {
15072 EVT VT = Op.getValueType();
15073
15074 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15075 return LowerToScalableOp(Op, DAG);
15076
15077 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
15078 "Unexpected vector type!");
15079
15080 // We can handle the constant cases during isel.
15081 if (isa<ConstantSDNode>(Op.getOperand(0)))
15082 return Op;
15083
15084 // There isn't a natural way to handle the general i1 case, so we use some
15085 // trickery with whilelo.
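// Sign-extending the i1 splat value gives either 0 or all-ones in an i64.
// WHILELO(0, 0) then produces an all-false predicate, while WHILELO(0, ~0)
// produces an all-true one, since every element index is unsigned-less-than
// the bound.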
15086 SDLoc DL(Op);
15087 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
15088 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
15089 DAG.getValueType(MVT::i1));
15090 SDValue ID =
15091 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
15092 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
15093 if (VT == MVT::nxv1i1)
15094 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
15095 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
15096 Zero, SplatVal),
15097 Zero);
15098 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
15099}
15100
15101SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
15102 SelectionDAG &DAG) const {
15103 SDLoc DL(Op);
15104
15105 EVT VT = Op.getValueType();
15106 if (!isTypeLegal(VT) || !VT.isScalableVector())
15107 return SDValue();
15108
15109 // Current lowering only supports the SVE-ACLE types.
15110 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
15111 return SDValue();
15112
15113 // The DUPQ operation is independent of element type so normalise to i64s.
15114 SDValue Idx128 = Op.getOperand(2);
15115
15116 // DUPQ can be used when idx is in range.
15117 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
15118 if (CIdx && (CIdx->getZExtValue() <= 3)) {
15119 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
15120 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
15121 }
15122
15123 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
15124
15125 // The ACLE says this must produce the same result as:
15126 // svtbl(data, svadd_x(svptrue_b64(),
15127 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
15128 // index * 2))
15129 SDValue One = DAG.getConstant(1, DL, MVT::i64);
15130 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
15131
15132 // create the vector 0,1,0,1,...
15133 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
15134 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
15135
15136 // create the vector idx64,idx64+1,idx64,idx64+1,...
15137 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
15138 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
15139 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
15140
15141 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
15142 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
15143 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
15144}
15145
15146
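// Expand a constant-splat BUILD_VECTOR into two bit patterns covering the full
// vector width: CnstBits receives the splatted constant bits and UndefBits
// marks the bit positions that originate from undef elements. Returns false if
// BVN is not a constant splat.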
15147static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
15148 APInt &UndefBits) {
15149 EVT VT = BVN->getValueType(0);
15150 APInt SplatBits, SplatUndef;
15151 unsigned SplatBitSize;
15152 bool HasAnyUndefs;
15153 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
15154 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
15155
15156 for (unsigned i = 0; i < NumSplats; ++i) {
15157 CnstBits <<= SplatBitSize;
15158 UndefBits <<= SplatBitSize;
15159 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
15160 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
15161 }
15162
15163 return true;
15164 }
15165
15166 return false;
15167}
15168
15169// Try 64-bit splatted SIMD immediate.
15170static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15171 const APInt &Bits) {
15172 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15173 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15174 EVT VT = Op.getValueType();
15175 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
15176
15177 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
15178 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
15179
15180 SDLoc DL(Op);
15181 SDValue Mov =
15182 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15183 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15184 }
15185 }
15186
15187 return SDValue();
15188}
15189
15190// Try 32-bit splatted SIMD immediate.
15191static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15192 const APInt &Bits,
15193 const SDValue *LHS = nullptr) {
15194 EVT VT = Op.getValueType();
15195 if (VT.isFixedLengthVector() &&
15197 return SDValue();
15198
15199 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15200 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15201 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
15202 bool isAdvSIMDModImm = false;
15203 uint64_t Shift;
15204
15205 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
15206 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
15207 Shift = 0;
15208 }
15209 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
15210 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
15211 Shift = 8;
15212 }
15213 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
15214 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
15215 Shift = 16;
15216 }
15217 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
15218 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
15219 Shift = 24;
15220 }
15221
15222 if (isAdvSIMDModImm) {
15223 SDLoc DL(Op);
15224 SDValue Mov;
15225
15226 if (LHS)
15227 Mov = DAG.getNode(NewOp, DL, MovTy,
15228 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
15229 DAG.getConstant(Value, DL, MVT::i32),
15230 DAG.getConstant(Shift, DL, MVT::i32));
15231 else
15232 Mov =
15233 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15234 DAG.getConstant(Shift, DL, MVT::i32));
15235
15236 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15237 }
15238 }
15239
15240 return SDValue();
15241}
15242
15243// Try 16-bit splatted SIMD immediate.
15244static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15245 const APInt &Bits,
15246 const SDValue *LHS = nullptr) {
15247 EVT VT = Op.getValueType();
15248 if (VT.isFixedLengthVector() &&
15250 return SDValue();
15251
15252 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15253 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15254 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
15255 bool isAdvSIMDModImm = false;
15256 uint64_t Shift;
15257
15258 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
15259 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
15260 Shift = 0;
15261 }
15262 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
15263 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
15264 Shift = 8;
15265 }
15266
15267 if (isAdvSIMDModImm) {
15268 SDLoc DL(Op);
15269 SDValue Mov;
15270
15271 if (LHS)
15272 Mov = DAG.getNode(NewOp, DL, MovTy,
15273 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
15274 DAG.getConstant(Value, DL, MVT::i32),
15275 DAG.getConstant(Shift, DL, MVT::i32));
15276 else
15277 Mov =
15278 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15279 DAG.getConstant(Shift, DL, MVT::i32));
15280
15281 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15282 }
15283 }
15284
15285 return SDValue();
15286}
15287
15288// Try 32-bit splatted SIMD immediate with shifted ones.
15289static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
15290 SelectionDAG &DAG, const APInt &Bits) {
15291 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15292 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15293 EVT VT = Op.getValueType();
15294 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
15295 bool isAdvSIMDModImm = false;
15296 uint64_t Shift;
15297
15298 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
15299 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
15300 Shift = 264;
15301 }
15302 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
15303 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
15304 Shift = 272;
15305 }
15306
15307 if (isAdvSIMDModImm) {
15308 SDLoc DL(Op);
15309 SDValue Mov =
15310 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15311 DAG.getConstant(Shift, DL, MVT::i32));
15312 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15313 }
15314 }
15315
15316 return SDValue();
15317}
15318
15319// Try 8-bit splatted SIMD immediate.
15320static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15321 const APInt &Bits) {
15322 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15323 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15324 EVT VT = Op.getValueType();
15325 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
15326
15327 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
15328 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
15329
15330 SDLoc DL(Op);
15331 SDValue Mov =
15332 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15333 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15334 }
15335 }
15336
15337 return SDValue();
15338}
15339
15340// Try FP splatted SIMD immediate.
15341static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15342 const APInt &Bits) {
15343 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15344 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15345 EVT VT = Op.getValueType();
15346 bool isWide = (VT.getSizeInBits() == 128);
15347 MVT MovTy;
15348 bool isAdvSIMDModImm = false;
15349
15350 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
15351 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
15352 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
15353 }
15354 else if (isWide &&
15355 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
15356 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
15357 MovTy = MVT::v2f64;
15358 }
15359
15360 if (isAdvSIMDModImm) {
15361 SDLoc DL(Op);
15362 SDValue Mov =
15363 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15364 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15365 }
15366 }
15367
15368 return SDValue();
15369}
15370
15371// Specialized code to quickly find if PotentialBVec is a BuildVector that
15372// consists of only the same constant int value, returned in reference arg
15373// ConstVal
15374static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
15375 uint64_t &ConstVal) {
15376 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
15377 if (!Bvec)
15378 return false;
15379 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
15380 if (!FirstElt)
15381 return false;
15382 EVT VT = Bvec->getValueType(0);
15383 unsigned NumElts = VT.getVectorNumElements();
15384 for (unsigned i = 1; i < NumElts; ++i)
15385 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
15386 return false;
15387 ConstVal = FirstElt->getZExtValue();
15388 return true;
15389}
15390
15391static bool isAllInactivePredicate(SDValue N) {
15392 // Look through cast.
15393 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
15394 N = N.getOperand(0);
15395
15396 return ISD::isConstantSplatVectorAllZeros(N.getNode());
15397}
15398
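// Return true when N is known to be an all-active SVE predicate: a constant
// all-ones splat, a "ptrue all" whose implicit element type is at least as
// large as N's, or (when the exact vector length is known) a ptrue whose
// pattern covers every element.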
15399static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
15400 unsigned NumElts = N.getValueType().getVectorMinNumElements();
15401
15402 // Look through cast.
15403 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
15404 N = N.getOperand(0);
15405 // When reinterpreting from a type with fewer elements the "new" elements
15406 // are not active, so bail if they're likely to be used.
15407 if (N.getValueType().getVectorMinNumElements() < NumElts)
15408 return false;
15409 }
15410
15411 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
15412 return true;
15413
15414 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
15415 // or smaller than the implicit element type represented by N.
15416 // NOTE: A larger element count implies a smaller element type.
15417 if (N.getOpcode() == AArch64ISD::PTRUE &&
15418 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
15419 return N.getValueType().getVectorMinNumElements() >= NumElts;
15420
15421 // If we're compiling for a specific vector-length, we can check if the
15422 // pattern's VL equals that of the scalable vector at runtime.
15423 if (N.getOpcode() == AArch64ISD::PTRUE) {
15424 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15425 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
15426 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
15427 if (MaxSVESize && MinSVESize == MaxSVESize) {
15428 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
15429 unsigned PatNumElts =
15430 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
15431 return PatNumElts == (NumElts * VScale);
15432 }
15433 }
15434
15435 return false;
15436}
15437
15438// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
15439// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
15440// BUILD_VECTORs with constant element C1, C2 is a constant, and:
15441// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15442// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15443// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
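// For example, with 16-bit elements and C2 == 4, matching requires the AND
// mask to be 0x000F (== ~(0xFFFF << 4)); (or (and X, 0x000F), (shl Y, 4)) then
// becomes (SLI X, Y, 4), which keeps the low 4 bits of each X lane and inserts
// the shifted Y above them.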
15444static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
15445 EVT VT = N->getValueType(0);
15446
15447 if (!VT.isVector())
15448 return SDValue();
15449
15450 SDLoc DL(N);
15451
15452 SDValue And;
15453 SDValue Shift;
15454
15455 SDValue FirstOp = N->getOperand(0);
15456 unsigned FirstOpc = FirstOp.getOpcode();
15457 SDValue SecondOp = N->getOperand(1);
15458 unsigned SecondOpc = SecondOp.getOpcode();
15459
15460 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15461 // a BICi in order to use an immediate instead of a register.
15462 // Is the other operand an shl or lshr? This will have been turned into:
15463 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15464 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
15465 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15466 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15467 SecondOpc == AArch64ISD::SHL_PRED ||
15468 SecondOpc == AArch64ISD::SRL_PRED)) {
15469 And = FirstOp;
15470 Shift = SecondOp;
15471
15472 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15473 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15474 FirstOpc == AArch64ISD::SHL_PRED ||
15475 FirstOpc == AArch64ISD::SRL_PRED)) {
15476 And = SecondOp;
15477 Shift = FirstOp;
15478 } else
15479 return SDValue();
15480
15481 bool IsAnd = And.getOpcode() == ISD::AND;
15482 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15483 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15484 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15485 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15486
15487 // Is the shift amount constant and are all lanes active?
15488 uint64_t C2;
15489 if (ShiftHasPredOp) {
15490 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15491 return SDValue();
15492 APInt C;
15493 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
15494 return SDValue();
15495 C2 = C.getZExtValue();
15496 } else if (ConstantSDNode *C2node =
15497 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
15498 C2 = C2node->getZExtValue();
15499 else
15500 return SDValue();
15501
15502 APInt C1AsAPInt;
15503 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15504 if (IsAnd) {
15505 // Is the and mask vector all constant?
15506 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15507 return SDValue();
15508 } else {
15509 // Reconstruct the corresponding AND immediate from the two BICi immediates.
15510 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15511 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15512 assert(C1nodeImm && C1nodeShift);
15513 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15514 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15515 }
15516
15517 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15518 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15519 // how much one can shift elements of a particular size?
15520 if (C2 > ElemSizeInBits)
15521 return SDValue();
15522
15523 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15524 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15525 if (C1AsAPInt != RequiredC1)
15526 return SDValue();
15527
15528 SDValue X = And.getOperand(0);
15529 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15530 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15531 : Shift.getOperand(1);
15532
15533 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15534 return DAG.getNode(Inst, DL, VT, X, Y, Imm);
15535}
15536
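// Attempt to fold (or (and a, b), (and (not a), c)) into an AArch64 BSP
// (bitwise select) node. This also handles the form InstCombine produces,
// where "not a" appears as (add a, -1) paired with (sub 0, a), and the case
// where the two AND masks are complementary constant vectors.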
15537static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
15538 EVT VT = N->getValueType(0);
15539 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15540 SDLoc DL(N);
15541 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15542
15543 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15544 return SDValue();
15545
15546 SDValue N0 = N->getOperand(0);
15547 if (N0.getOpcode() != ISD::AND)
15548 return SDValue();
15549
15550 SDValue N1 = N->getOperand(1);
15551 if (N1.getOpcode() != ISD::AND)
15552 return SDValue();
15553
15554 // InstCombine does (not (neg a)) => (add a -1).
15555 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15556 // Loop over all combinations of AND operands.
15557 for (int i = 1; i >= 0; --i) {
15558 for (int j = 1; j >= 0; --j) {
15559 SDValue O0 = N0->getOperand(i);
15560 SDValue O1 = N1->getOperand(j);
15561 SDValue Sub, Add, SubSibling, AddSibling;
15562
15563 // Find a SUB and an ADD operand, one from each AND.
15564 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15565 Sub = O0;
15566 Add = O1;
15567 SubSibling = N0->getOperand(1 - i);
15568 AddSibling = N1->getOperand(1 - j);
15569 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15570 Add = O0;
15571 Sub = O1;
15572 AddSibling = N0->getOperand(1 - i);
15573 SubSibling = N1->getOperand(1 - j);
15574 } else
15575 continue;
15576
15577 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15578 continue;
15579
15580 // The constant all-ones splat is always the right-hand operand of the Add.
15581 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15582 continue;
15583
15584 if (Sub.getOperand(1) != Add.getOperand(0))
15585 continue;
15586
15587 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15588 }
15589 }
15590
15591 // (or (and a b) (and (not a) c)) => (bsl a b c)
15592 // We only have to look for constant vectors here since the general, variable
15593 // case can be handled in TableGen.
15594 unsigned Bits = VT.getScalarSizeInBits();
15595 for (int i = 1; i >= 0; --i)
15596 for (int j = 1; j >= 0; --j) {
15597 APInt Val1, Val2;
15598
15599 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15600 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
15601 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15602 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15603 N0->getOperand(1 - i), N1->getOperand(1 - j));
15604 }
15605 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15606 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15607 if (!BVN0 || !BVN1)
15608 continue;
15609
15610 bool FoundMatch = true;
15611 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15612 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15613 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15614 if (!CN0 || !CN1 ||
15615 CN0->getAPIntValue().trunc(Bits) !=
15616 ~CN1->getAsAPIntVal().trunc(Bits)) {
15617 FoundMatch = false;
15618 break;
15619 }
15620 }
15621 if (FoundMatch)
15622 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15623 N0->getOperand(1 - i), N1->getOperand(1 - j));
15624 }
15625
15626 return SDValue();
15627}
15628
15629SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15630 SelectionDAG &DAG) const {
15631 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15632 !Subtarget->isNeonAvailable()))
15633 return LowerToScalableOp(Op, DAG);
15634
15635 if (SDValue Res = tryLowerToBSL(Op, DAG))
15636 return Res;
15637
15638 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15639 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15640 return Res;
15641
15642 EVT VT = Op.getValueType();
15643 if (VT.isScalableVector())
15644 return Op;
15645
15646 SDValue LHS = Op.getOperand(0);
15647 BuildVectorSDNode *BVN =
15648 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15649 if (!BVN) {
15650 // OR commutes, so try swapping the operands.
15651 LHS = Op.getOperand(1);
15652 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15653 }
15654 if (!BVN)
15655 return Op;
15656
15657 APInt DefBits(VT.getSizeInBits(), 0);
15658 APInt UndefBits(VT.getSizeInBits(), 0);
15659 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15660 SDValue NewOp;
15661
15662 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15663 DefBits, &LHS)) ||
15664 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15665 DefBits, &LHS)))
15666 return NewOp;
15667
15668 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15669 UndefBits, &LHS)) ||
15670 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15671 UndefBits, &LHS)))
15672 return NewOp;
15673 }
15674
15675 // We can always fall back to a non-immediate OR.
15676 return Op;
15677}
15678
15679// Normalize the operands of BUILD_VECTOR. The value of constant operands will
15680// be truncated to fit element width.
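// For example (illustrative): a v8i8 BUILD_VECTOR whose lanes are i32
// constants such as 0x1ff has each constant lane rewritten as (i32 0xff),
// i.e. the value truncated to the 8-bit element width; undef lanes stay
// undef and other i32 lanes are passed through unchanged.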
15681 static SDValue NormalizeBuildVector(SDValue Op,
15682 SelectionDAG &DAG) {
15683 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15684 SDLoc DL(Op);
15685 EVT VT = Op.getValueType();
15686 EVT EltTy = VT.getVectorElementType();
15687
15688 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15689 return Op;
15690
15691 SmallVector<SDValue, 16> Ops;
15692 for (SDValue Lane : Op->ops()) {
15693 // For integer vectors, type legalization would have promoted the
15694 // operands already. Otherwise, if Op is a floating-point splat
15695 // (with operands cast to integers), then the only possibilities
15696 // are constants and UNDEFs.
15697 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
15698 Lane = DAG.getConstant(
15699 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15700 DL, MVT::i32);
15701 } else if (Lane.getNode()->isUndef()) {
15702 Lane = DAG.getUNDEF(MVT::i32);
15703 } else {
15704 assert(Lane.getValueType() == MVT::i32 &&
15705 "Unexpected BUILD_VECTOR operand type");
15706 }
15707 Ops.push_back(Lane);
15708 }
15709 return DAG.getBuildVector(VT, DL, Ops);
15710}
15711
15712 static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG,
15713 const AArch64Subtarget *ST, APInt &DefBits) {
15714 EVT VT = Op.getValueType();
15715 // TODO: We should be able to support 64-bit destinations too
15716 if (!ST->hasSVE() || !VT.is128BitVector() ||
15717 DefBits.getHiBits(64) != DefBits.getLoBits(64))
15718 return SDValue();
15719
15720 // See if we can make use of the SVE dup instruction.
15721 APInt Val64 = DefBits.trunc(64);
15722 int32_t ImmVal, ShiftVal;
15723 if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal))
15724 return SDValue();
15725
15726 SDLoc DL(Op);
15727 SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL,
15728 DAG.getConstant(Val64, DL, MVT::i64));
15729 SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal);
15730 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res);
15731}
15732
15733 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
15734 const AArch64Subtarget *ST) {
15735 EVT VT = Op.getValueType();
15736 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15737 "Expected a legal NEON vector");
15738
15739 APInt DefBits(VT.getSizeInBits(), 0);
15740 APInt UndefBits(VT.getSizeInBits(), 0);
15741 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15742 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15743 auto TryMOVIWithBits = [&](APInt DefBits) {
15744 SDValue NewOp;
15745 if ((NewOp =
15746 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15747 (NewOp =
15748 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15749 (NewOp =
15750 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15751 (NewOp =
15752 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15753 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15754 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15755 return NewOp;
15756
15757 APInt NotDefBits = ~DefBits;
15758 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15759 NotDefBits)) ||
15760 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15761 NotDefBits)) ||
15762 (NewOp =
15763 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15764 return NewOp;
15765 return SDValue();
15766 };
15767 if (SDValue R = TryMOVIWithBits(DefBits))
15768 return R;
15769 if (SDValue R = TryMOVIWithBits(UndefBits))
15770 return R;
15771
15772 // Try to materialise the constant using SVE when available.
15773 if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
15774 return R;
15775
15776 // See if a fneg of the constant can be materialized with a MOVI, etc
15777 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15778 // FNegate each sub-element of the constant
15779 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
15780 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15781 .zext(VT.getSizeInBits());
15782 APInt NegBits(VT.getSizeInBits(), 0);
15783 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15784 for (unsigned i = 0; i < NumElts; i++)
15785 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15786 NegBits = DefBits ^ NegBits;
15787
15788 // Try to create the new constants with MOVI, and if so generate a fneg
15789 // for it.
15790 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15791 SDLoc DL(Op);
15792 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15793 return DAG.getNode(
15794 AArch64ISD::NVCAST, DL, VT,
15795 DAG.getNode(ISD::FNEG, DL, VFVT,
15796 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15797 }
15798 return SDValue();
15799 };
15800 SDValue R;
15801 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15802 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15803 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15804 return R;
15805 }
15806
15807 return SDValue();
15808}
15809
15810SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15811 SDValue Op, SelectionDAG &DAG) const {
15812 EVT VT = Op.getValueType();
15813 SDLoc DL(Op);
15814 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15815 auto *BVN = cast<BuildVectorSDNode>(Op);
15816
15817 if (auto SeqInfo = BVN->isConstantSequence()) {
15818 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15819 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15820 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15821 return convertFromScalableVector(DAG, VT, Seq);
15822 }
15823
15824 unsigned NumElems = VT.getVectorNumElements();
15825 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15826 NumElems <= 1 || BVN->isConstant())
15827 return SDValue();
15828
15829 auto IsExtractElt = [](SDValue Op) {
15830 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15831 };
15832
15834 // For integer types that are not already in vectors, limit to at most four
15834 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15835 if (VT.getScalarType().isInteger() &&
15836 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15837 return SDValue();
15838
15839 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
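// For example (illustrative): a v4i32 BUILD_VECTOR of a, b, c, d becomes four
// single-lane inserts, then ZIP1 interleaves a with b and c with d, and a
// final ZIP1 (on wider elements) combines {a,b} with {c,d}, leaving a, b, c, d
// in the low 128 bits of the scalable register before converting back to the
// fixed-length type.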
15840 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
15841 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
15842 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
15843 return Op.isUndef() ? Undef
15844 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15845 ContainerVT, Undef, Op, ZeroI64);
15846 });
15847
15848 ElementCount ZipEC = ContainerVT.getVectorElementCount();
15849 while (Intermediates.size() > 1) {
15850 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15851
15852 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15853 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15854 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
15855 Intermediates[I / 2] =
15856 Op1.isUndef() ? Op0
15857 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15858 }
15859
15860 Intermediates.resize(Intermediates.size() / 2);
15861 ZipEC = ZipEC.divideCoefficientBy(2);
15862 }
15863
15864 assert(Intermediates.size() == 1);
15865 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15866 return convertFromScalableVector(DAG, VT, Vec);
15867}
15868
15869SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15870 SelectionDAG &DAG) const {
15871 EVT VT = Op.getValueType();
15872
15873 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15874 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15875 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15876 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15877
15878 // Try to build a simple constant vector.
15879 Op = NormalizeBuildVector(Op, DAG);
15880 // This might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
15881 // abort.
15882 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15883 return SDValue();
15884
15885 // Certain vector constants, used to express things like logical NOT and
15886 // arithmetic NEG, are passed through unmodified. This allows special
15887 // patterns for these operations to match, which will lower these constants
15888 // to whatever is proven necessary.
15889 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15890 if (BVN->isConstant()) {
15891 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15892 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15893 APInt Val(BitSize,
15894 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
15895 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15896 return Op;
15897 }
15898 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15899 if (Const->isZero() && !Const->isNegative())
15900 return Op;
15901 }
15902
15903 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
15904 return V;
15905
15906 // Scan through the operands to find some interesting properties we can
15907 // exploit:
15908 // 1) If only one value is used, we can use a DUP, or
15909 // 2) if only the low element is not undef, we can just insert that, or
15910 // 3) if only one constant value is used (w/ some non-constant lanes),
15911 // we can splat the constant value into the whole vector then fill
15912 // in the non-constant lanes.
15913 // 4) FIXME: If different constant values are used, but we can intelligently
15914 // select the values we'll be overwriting for the non-constant
15915 // lanes such that we can directly materialize the vector
15916 // some other way (MOVI, e.g.), we can be sneaky.
15917 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
15918 SDLoc DL(Op);
15919 unsigned NumElts = VT.getVectorNumElements();
15920 bool isOnlyLowElement = true;
15921 bool usesOnlyOneValue = true;
15922 bool usesOnlyOneConstantValue = true;
15923 bool isConstant = true;
15924 bool AllLanesExtractElt = true;
15925 unsigned NumConstantLanes = 0;
15926 unsigned NumDifferentLanes = 0;
15927 unsigned NumUndefLanes = 0;
15928 SDValue Value;
15929 SDValue ConstantValue;
15930 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
15931 unsigned ConsecutiveValCount = 0;
15932 SDValue PrevVal;
15933 for (unsigned i = 0; i < NumElts; ++i) {
15934 SDValue V = Op.getOperand(i);
15935 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15936 AllLanesExtractElt = false;
15937 if (V.isUndef()) {
15938 ++NumUndefLanes;
15939 continue;
15940 }
15941 if (i > 0)
15942 isOnlyLowElement = false;
15943 if (!isIntOrFPConstant(V))
15944 isConstant = false;
15945
15946 if (isIntOrFPConstant(V)) {
15947 ++NumConstantLanes;
15948 if (!ConstantValue.getNode())
15949 ConstantValue = V;
15950 else if (ConstantValue != V)
15951 usesOnlyOneConstantValue = false;
15952 }
15953
15954 if (!Value.getNode())
15955 Value = V;
15956 else if (V != Value) {
15957 usesOnlyOneValue = false;
15958 ++NumDifferentLanes;
15959 }
15960
15961 if (PrevVal != V) {
15962 ConsecutiveValCount = 0;
15963 PrevVal = V;
15964 }
15965
15966 // Keep each different value and its last consecutive count. For example,
15967 //
15968 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15969 // t24, t24, t24, t24, t24, t24, t24, t24
15970 // t23 = consecutive count 8
15971 // t24 = consecutive count 8
15972 // ------------------------------------------------------------------
15973 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
15974 // t24, t24, t24, t24, t24, t24, t24, t24
15975 // t23 = consecutive count 5
15976 // t24 = consecutive count 9
15977 DifferentValueMap[V] = ++ConsecutiveValCount;
15978 }
15979
15980 if (!Value.getNode()) {
15981 LLVM_DEBUG(
15982 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
15983 return DAG.getUNDEF(VT);
15984 }
15985
15986 // Convert BUILD_VECTOR where all elements but the lowest are undef into
15987 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
15988 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
15989 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
15990 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
15991 "SCALAR_TO_VECTOR node\n");
15992 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
15993 }
15994
15995 if (AllLanesExtractElt) {
15996 SDNode *Vector = nullptr;
15997 bool Even = false;
15998 bool Odd = false;
15999 // Check whether the extract elements match the Even pattern <0,2,4,...> or
16000 // the Odd pattern <1,3,5,...>.
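// For example (illustrative): a v4i16 BUILD_VECTOR of
//   extractelt(V, 0), extractelt(V, 2), extractelt(V, 4), extractelt(V, 6)
// matches the Even pattern and is lowered to UZP1 of the two halves of V,
// while indices <1,3,5,7> would match the Odd pattern and use UZP2.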
16001 for (unsigned i = 0; i < NumElts; ++i) {
16002 SDValue V = Op.getOperand(i);
16003 const SDNode *N = V.getNode();
16004 if (!isa<ConstantSDNode>(N->getOperand(1))) {
16005 Even = false;
16006 Odd = false;
16007 break;
16008 }
16009 SDValue N0 = N->getOperand(0);
16010
16011 // All elements are extracted from the same vector.
16012 if (!Vector) {
16013 Vector = N0.getNode();
16014 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
16015 // BUILD_VECTOR.
16016 if (VT.getVectorElementType() !=
16017 N0.getValueType().getVectorElementType())
16018 break;
16019 } else if (Vector != N0.getNode()) {
16020 Odd = false;
16021 Even = false;
16022 break;
16023 }
16024
16025 // Extracted values are either at Even indices <0,2,4,...> or at Odd
16026 // indices <1,3,5,...>.
16027 uint64_t Val = N->getConstantOperandVal(1);
16028 if (Val == 2 * i) {
16029 Even = true;
16030 continue;
16031 }
16032 if (Val - 1 == 2 * i) {
16033 Odd = true;
16034 continue;
16035 }
16036
16037 // Something does not match: abort.
16038 Odd = false;
16039 Even = false;
16040 break;
16041 }
16042 if (Even || Odd) {
16043 SDValue LHS =
16044 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
16045 DAG.getConstant(0, DL, MVT::i64));
16046 SDValue RHS =
16047 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
16048 DAG.getConstant(NumElts, DL, MVT::i64));
16049
16050 if (Even && !Odd)
16051 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
16052 if (Odd && !Even)
16053 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
16054 }
16055 }
16056
16057 // Use DUP for non-constant splats. For f32 constant splats, reduce to
16058 // i32 and try again.
16059 if (usesOnlyOneValue) {
16060 if (!isConstant) {
16061 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16062 Value.getValueType() != VT) {
16063 LLVM_DEBUG(
16064 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
16065 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
16066 }
16067
16068 // This is actually a DUPLANExx operation, which keeps everything in vector registers.
16069
16070 SDValue Lane = Value.getOperand(1);
16071 Value = Value.getOperand(0);
16072 if (Value.getValueSizeInBits() == 64) {
16073 LLVM_DEBUG(
16074 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
16075 "widening it\n");
16076 Value = WidenVector(Value, DAG);
16077 }
16078
16079 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
16080 return DAG.getNode(Opcode, DL, VT, Value, Lane);
16081 }
16082
16083 if (VT.getVectorElementType().isFloatingPoint()) {
16084 SmallVector<SDValue, 8> Ops;
16085 EVT EltTy = VT.getVectorElementType();
16086 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
16087 EltTy == MVT::f64) && "Unsupported floating-point vector type");
16088 LLVM_DEBUG(
16089 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
16090 "BITCASTS, and try again\n");
16091 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
16092 for (unsigned i = 0; i < NumElts; ++i)
16093 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
16094 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
16095 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
16096 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
16097 Val.dump(););
16098 Val = LowerBUILD_VECTOR(Val, DAG);
16099 if (Val.getNode())
16100 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
16101 }
16102 }
16103
16104 // If we need to insert a small number of different non-constant elements and
16105 // the vector width is sufficiently large, prefer using DUP with the common
16106 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
16107 // skip the constant lane handling below.
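// For example (illustrative): a v8i16 where six lanes hold the same
// non-constant value x and two lanes hold other values is cheaper as
//   DUP of x  +  two INSERT_VECTOR_ELTs
// than as eight independent lane inserts, so PreferDUPAndInsert is set below.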
16108 bool PreferDUPAndInsert =
16109 !isConstant && NumDifferentLanes >= 1 &&
16110 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
16111 NumDifferentLanes >= NumConstantLanes;
16112
16113 // If there was only one constant value used and for more than one lane,
16114 // start by splatting that value, then replace the non-constant lanes. This
16115 // is better than the default, which will perform a separate initialization
16116 // for each lane.
16117 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
16118 // Firstly, try to materialize the splat constant.
16119 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
16120 unsigned BitSize = VT.getScalarSizeInBits();
16121 APInt ConstantValueAPInt(1, 0);
16122 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
16123 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
16124 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
16125 !ConstantValueAPInt.isAllOnes()) {
16126 Val = ConstantBuildVector(Val, DAG, Subtarget);
16127 if (!Val)
16128 // Otherwise, materialize the constant and splat it.
16129 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
16130 }
16131
16132 // Now insert the non-constant lanes.
16133 for (unsigned i = 0; i < NumElts; ++i) {
16134 SDValue V = Op.getOperand(i);
16135 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16136 if (!isIntOrFPConstant(V) && !V.isUndef())
16137 // Note that type legalization likely mucked about with the VT of the
16138 // source operand, so we may have to convert it here before inserting.
16139 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
16140 }
16141 return Val;
16142 }
16143
16144 // This will generate a load from the constant pool.
16145 if (isConstant) {
16146 LLVM_DEBUG(
16147 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
16148 "expansion\n");
16149 return SDValue();
16150 }
16151
16152 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
16153 // v4i32s. This is really a truncate, which we can construct out of (legal)
16154 // concats and truncate nodes.
16155 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
16156 return M;
16157
16158 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
16159 if (NumElts >= 4) {
16160 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
16161 return Shuffle;
16162
16163 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
16164 return Shuffle;
16165 }
16166
16167 if (PreferDUPAndInsert) {
16168 // First, build a constant vector with the common element.
16169 SmallVector<SDValue, 8> Ops(NumElts, Value);
16170 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
16171 // Next, insert the elements that do not match the common value.
16172 for (unsigned I = 0; I < NumElts; ++I)
16173 if (Op.getOperand(I) != Value)
16174 NewVector =
16175 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
16176 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
16177
16178 return NewVector;
16179 }
16180
16181 // If vector consists of two different values, try to generate two DUPs and
16182 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
16183 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
16184 SmallVector<SDValue, 2> Vals;
16185 // Check that the consecutive count of each value is half the number of vector
16186 // elements. In this case, we can use CONCAT_VECTORS. For example,
16187 //
16188 // canUseVECTOR_CONCAT = true;
16189 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
16190 // t24, t24, t24, t24, t24, t24, t24, t24
16191 //
16192 // canUseVECTOR_CONCAT = false;
16193 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
16194 // t24, t24, t24, t24, t24, t24, t24, t24
16195 bool canUseVECTOR_CONCAT = true;
16196 for (auto Pair : DifferentValueMap) {
16197 // Check different values have same length which is NumElts / 2.
16198 if (Pair.second != NumElts / 2)
16199 canUseVECTOR_CONCAT = false;
16200 Vals.push_back(Pair.first);
16201 }
16202
16203 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
16204 // CONCAT_VECTORs. For example,
16205 //
16206 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
16207 // t24, t24, t24, t24, t24, t24, t24, t24
16208 // ==>
16209 // t26: v8i8 = AArch64ISD::DUP t23
16210 // t28: v8i8 = AArch64ISD::DUP t24
16211 // t29: v16i8 = concat_vectors t26, t28
16212 if (canUseVECTOR_CONCAT) {
16213 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16214 if (isTypeLegal(SubVT) && SubVT.isVector() &&
16215 SubVT.getVectorNumElements() >= 2) {
16216 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
16217 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
16218 SDValue DUP1 =
16219 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
16220 SDValue DUP2 =
16221 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
16222 SDValue CONCAT_VECTORS =
16223 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
16224 return CONCAT_VECTORS;
16225 }
16226 }
16227
16228 // Let's try to generate VECTOR_SHUFFLE. For example,
16229 //
16230 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
16231 // ==>
16232 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
16233 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
16234 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
16235 if (NumElts >= 8) {
16236 SmallVector<int, 16> MaskVec;
16237 // Build the mask for VECTOR_SHUFFLE.
16238 SDValue FirstLaneVal = Op.getOperand(0);
16239 for (unsigned i = 0; i < NumElts; ++i) {
16240 SDValue Val = Op.getOperand(i);
16241 if (FirstLaneVal == Val)
16242 MaskVec.push_back(i);
16243 else
16244 MaskVec.push_back(i + NumElts);
16245 }
16246
16247 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
16248 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
16249 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
16250 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
16251 SDValue VECTOR_SHUFFLE =
16252 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
16253 return VECTOR_SHUFFLE;
16254 }
16255 }
16256
16257 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
16258 // know the default expansion would otherwise fall back on something even
16259 // worse. For a vector with one or two non-undef values, that's
16260 // scalar_to_vector for the elements followed by a shuffle (provided the
16261 // shuffle is valid for the target) and materialization element by element
16262 // on the stack followed by a load for everything else.
16263 if (!isConstant && !usesOnlyOneValue) {
16264 LLVM_DEBUG(
16265 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
16266 "of INSERT_VECTOR_ELT\n");
16267
16268 SDValue Vec = DAG.getUNDEF(VT);
16269 SDValue Op0 = Op.getOperand(0);
16270 unsigned i = 0;
16271
16272 // Use SCALAR_TO_VECTOR for lane zero to
16273 // a) Avoid a RMW dependency on the full vector register, and
16274 // b) Allow the register coalescer to fold away the copy if the
16275 // value is already in an S or D register, and we're forced to emit an
16276 // INSERT_SUBREG that we can't fold anywhere.
16277 //
16278 // We also allow types like i8 and i16 which are illegal scalar but legal
16279 // vector element types. After type-legalization the inserted value is
16280 // extended (i32) and it is safe to cast them to the vector type by ignoring
16281 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
16282 if (!Op0.isUndef()) {
16283 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
16284 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
16285 ++i;
16286 }
16287 LLVM_DEBUG({
16288 if (i < NumElts)
16289 dbgs() << "Creating nodes for the other vector elements:\n";
16290 });
16291 for (; i < NumElts; ++i) {
16292 SDValue V = Op.getOperand(i);
16293 if (V.isUndef())
16294 continue;
16295 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16296 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
16297 }
16298 return Vec;
16299 }
16300
16301 LLVM_DEBUG(
16302 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
16303 "better alternative\n");
16304 return SDValue();
16305}
16306
16307SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
16308 SelectionDAG &DAG) const {
16309 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
16310 !Subtarget->isNeonAvailable()))
16311 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
16312
16313 assert(Op.getValueType().isScalableVector() &&
16314 isTypeLegal(Op.getValueType()) &&
16315 "Expected legal scalable vector type!");
16316
16317 if (isTypeLegal(Op.getOperand(0).getValueType())) {
16318 unsigned NumOperands = Op->getNumOperands();
16319 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
16320 "Unexpected number of operands in CONCAT_VECTORS");
16321
16322 if (NumOperands == 2)
16323 return Op;
16324
16325 // Concat each pair of subvectors and pack into the lower half of the array.
16326 SmallVector<SDValue> ConcatOps(Op->ops());
16327 while (ConcatOps.size() > 1) {
16328 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
16329 SDValue V1 = ConcatOps[I];
16330 SDValue V2 = ConcatOps[I + 1];
16331 EVT SubVT = V1.getValueType();
16332 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
16333 ConcatOps[I / 2] =
16334 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
16335 }
16336 ConcatOps.resize(ConcatOps.size() / 2);
16337 }
16338 return ConcatOps[0];
16339 }
16340
16341 return SDValue();
16342}
16343
16344SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
16345 SelectionDAG &DAG) const {
16346 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
16347
16348 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
16349 !Subtarget->isNeonAvailable()))
16350 return LowerFixedLengthInsertVectorElt(Op, DAG);
16351
16352 EVT VT = Op.getOperand(0).getValueType();
16353
16354 if (VT.getScalarType() == MVT::i1) {
16355 EVT VectorVT = getPromotedVTForPredicate(VT);
16356 SDLoc DL(Op);
16357 SDValue ExtendedVector =
16358 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
16359 SDValue ExtendedValue =
16360 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
16361 VectorVT.getScalarType().getSizeInBits() < 32
16362 ? MVT::i32
16363 : VectorVT.getScalarType());
16364 ExtendedVector =
16365 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
16366 ExtendedValue, Op.getOperand(2));
16367 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
16368 }
16369
16370 // Check for non-constant or out of range lane.
16371 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
16372 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16373 return SDValue();
16374
16375 return Op;
16376}
16377
16378SDValue
16379AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
16380 SelectionDAG &DAG) const {
16381 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
16382 EVT VT = Op.getOperand(0).getValueType();
16383
16384 if (VT.getScalarType() == MVT::i1) {
16385 // We can't directly extract from an SVE predicate; extend it first.
16386 // (This isn't the only possible lowering, but it's straightforward.)
16387 EVT VectorVT = getPromotedVTForPredicate(VT);
16388 SDLoc DL(Op);
16389 SDValue Extend =
16390 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
16391 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
16392 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
16393 Extend, Op.getOperand(1));
16394 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
16395 }
16396
16397 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16398 return LowerFixedLengthExtractVectorElt(Op, DAG);
16399
16400 // Check for non-constant or out of range lane.
16401 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16402 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16403 return SDValue();
16404
16405 // Insertion/extraction are legal for V128 types.
16406 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
16407 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
16408 VT == MVT::v8f16 || VT == MVT::v8bf16)
16409 return Op;
16410
16411 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
16412 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
16413 VT != MVT::v4bf16)
16414 return SDValue();
16415
16416 // For V64 types, we perform extraction by expanding the value
16417 // to a V128 type and performing the extraction on that.
16418 SDLoc DL(Op);
16419 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
16420 EVT WideTy = WideVec.getValueType();
16421
16422 EVT ExtrTy = WideTy.getVectorElementType();
16423 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
16424 ExtrTy = MVT::i32;
16425
16426 // For extractions, we just return the result directly.
16427 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
16428 Op.getOperand(1));
16429}
16430
16431SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
16432 SelectionDAG &DAG) const {
16433 EVT VT = Op.getValueType();
16435 "Only cases that extract a fixed length vector are supported!");
16436 EVT InVT = Op.getOperand(0).getValueType();
16437
16438 // If we don't have legal types yet, do nothing
16439 if (!isTypeLegal(InVT))
16440 return SDValue();
16441
16442 if (InVT.is128BitVector()) {
16443 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
16444 unsigned Idx = Op.getConstantOperandVal(1);
16445
16446 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
16447 if (Idx == 0)
16448 return Op;
16449
16450 // If this is extracting the upper 64-bits of a 128-bit vector, we match
16451 // that directly.
16452 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
16453 return Op;
16454 }
16455
16456 if (InVT.isScalableVector() ||
16457 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16458 SDLoc DL(Op);
16459 SDValue Vec = Op.getOperand(0);
16460 SDValue Idx = Op.getOperand(1);
16461
16462 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16463 if (PackedVT != InVT) {
16464 // Pack input into the bottom part of an SVE register and try again.
16465 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16466 DAG.getUNDEF(PackedVT), Vec,
16467 DAG.getVectorIdxConstant(0, DL));
16468 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16469 }
16470
16471 // This will get matched by custom code during ISelDAGToDAG.
16472 if (isNullConstant(Idx))
16473 return Op;
16474
16475 assert(InVT.isScalableVector() && "Unexpected vector type!");
16476 // Move requested subvector to the start of the vector and try again.
16477 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
16478 return convertFromScalableVector(DAG, VT, Splice);
16479 }
16480
16481 return SDValue();
16482}
16483
16484SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16485 SelectionDAG &DAG) const {
16486 assert(Op.getValueType().isScalableVector() &&
16487 "Only expect to lower inserts into scalable vectors!");
16488
16489 EVT InVT = Op.getOperand(1).getValueType();
16490 unsigned Idx = Op.getConstantOperandVal(2);
16491
16492 SDValue Vec0 = Op.getOperand(0);
16493 SDValue Vec1 = Op.getOperand(1);
16494 SDLoc DL(Op);
16495 EVT VT = Op.getValueType();
16496
16497 if (InVT.isScalableVector()) {
16498 if (!isTypeLegal(VT))
16499 return SDValue();
16500
16501 // Break down insert_subvector into simpler parts.
16502 if (VT.getVectorElementType() == MVT::i1) {
16503 unsigned NumElts = VT.getVectorMinNumElements();
16504 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16505
16506 SDValue Lo, Hi;
16507 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16508 DAG.getVectorIdxConstant(0, DL));
16509 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16510 DAG.getVectorIdxConstant(NumElts / 2, DL));
16511 if (Idx < (NumElts / 2))
16512 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16513 DAG.getVectorIdxConstant(Idx, DL));
16514 else
16515 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16516 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16517
16518 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16519 }
16520
16521 // We can select these directly.
16522 if (isTypeLegal(InVT) && Vec0.isUndef())
16523 return Op;
16524
16525 // Ensure the subvector is half the size of the main vector.
16526 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16527 return SDValue();
16528
16529 // Here narrow and wide refer to the vector element types. After "casting",
16530 // both vectors must have the same bit length, so because the subvector
16531 // has fewer elements, those elements need to be bigger.
16532 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16533 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16534
16535 // NOP cast operands to the largest legal vector of the same element count.
16536 if (VT.isFloatingPoint()) {
16537 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16538 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16539 } else {
16540 // Legal integer vectors are already their largest so Vec0 is fine as is.
16541 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16542 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16543 }
16544
16545 // To replace the top/bottom half of vector V with vector SubV we widen the
16546 // preserved half of V, concatenate this to SubV (the order depending on the
16547 // half being replaced) and then narrow the result.
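// For example (illustrative): inserting an nxv2f32 SubV into the low half of
// an nxv4f32 V (Idx == 0) unpacks the preserved high half of V with UUNPKHI,
// and UZP1 then packs SubV's lanes together with that widened high half back
// into a single nxv4f32-sized result.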
16548 SDValue Narrow;
16549 if (Idx == 0) {
16550 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16551 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16552 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16553 } else {
16554 assert(Idx == InVT.getVectorMinNumElements() &&
16555 "Invalid subvector index!");
16556 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16557 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16558 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16559 }
16560
16561 return getSVESafeBitCast(VT, Narrow, DAG);
16562 }
16563
16564 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16565 // This will be matched by custom code during ISelDAGToDAG.
16566 if (Vec0.isUndef())
16567 return Op;
16568
16569 std::optional<unsigned> PredPattern =
16570 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
16571 auto PredTy = VT.changeVectorElementType(MVT::i1);
16572 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16573 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16574 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16575 }
16576
16577 return SDValue();
16578}
16579
16580static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16581 if (Op.getOpcode() != AArch64ISD::DUP &&
16582 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16583 Op.getOpcode() != ISD::BUILD_VECTOR)
16584 return false;
16585
16586 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16587 !isAllConstantBuildVector(Op, SplatVal))
16588 return false;
16589
16590 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16591 !isa<ConstantSDNode>(Op->getOperand(0)))
16592 return false;
16593
16594 SplatVal = Op->getConstantOperandVal(0);
16595 if (Op.getValueType().getVectorElementType() != MVT::i64)
16596 SplatVal = (int32_t)SplatVal;
16597
16598 Negated = false;
16599 if (isPowerOf2_64(SplatVal))
16600 return true;
16601
16602 Negated = true;
16603 if (isPowerOf2_64(-SplatVal)) {
16604 SplatVal = -SplatVal;
16605 return true;
16606 }
16607
16608 return false;
16609}
16610
16611SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16612 EVT VT = Op.getValueType();
16613 SDLoc DL(Op);
16614
16615 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16616 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16617
16618 assert(VT.isScalableVector() && "Expected a scalable vector.");
16619
16620 bool Signed = Op.getOpcode() == ISD::SDIV;
16621 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16622
16623 bool Negated;
16624 uint64_t SplatVal;
16625 // NOTE: SRAD cannot be used to represent sdiv-by-one.
16626 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
16627 SplatVal > 1) {
16628 SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
16629 SDValue Res =
16630 DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16631 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16632 if (Negated)
16633 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16634
16635 return Res;
16636 }
16637
16638 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
16639 return LowerToPredicatedOp(Op, DAG, PredOpcode);
16640
16641 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16642 // operations, and truncate the result.
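// For example (illustrative): an nxv16i8 sdiv is unpacked with
// SUNPKLO/SUNPKHI into two nxv8i16 halves, each lowered recursively (widening
// again to nxv4i32, where a real SDIV exists), and UZP1 then packs the two
// halves back into nxv16i8.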
16643 EVT WidenedVT;
16644 if (VT == MVT::nxv16i8)
16645 WidenedVT = MVT::nxv8i16;
16646 else if (VT == MVT::nxv8i16)
16647 WidenedVT = MVT::nxv4i32;
16648 else
16649 llvm_unreachable("Unexpected Custom DIV operation");
16650
16651 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16652 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16653 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16654 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16655 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16656 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16657 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
16658 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
16659 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16660 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16661 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16662}
16663
16664bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16665 EVT VT, unsigned DefinedValues) const {
16666 if (!Subtarget->isNeonAvailable())
16667 return false;
16668 return DefinedValues < 3;
16669}
16670
16671 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
16672 // Currently no fixed length shuffles that require SVE are legal.
16673 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16674 return false;
16675
16676 if (VT.getVectorNumElements() == 4 &&
16677 (VT.is128BitVector() || VT.is64BitVector())) {
16678 unsigned Cost = getPerfectShuffleCost(M);
16679 if (Cost <= 1)
16680 return true;
16681 }
16682
16683 bool DummyBool;
16684 int DummyInt;
16685 unsigned DummyUnsigned;
16686
16687 unsigned EltSize = VT.getScalarSizeInBits();
16688 unsigned NumElts = VT.getVectorNumElements();
16689 return (ShuffleVectorSDNode::isSplatMask(M) ||
16690 isREVMask(M, EltSize, NumElts, 64) ||
16691 isREVMask(M, EltSize, NumElts, 32) ||
16692 isREVMask(M, EltSize, NumElts, 16) ||
16693 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
16694 isSingletonEXTMask(M, VT, DummyUnsigned) ||
16695 isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
16696 isUZPMask(M, NumElts, DummyUnsigned) ||
16697 isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
16698 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
16699 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
16700 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
16701 isINSMask(M, NumElts, DummyBool, DummyInt) ||
16702 isConcatMask(M, VT, VT.getSizeInBits() == 128));
16703}
16704
16705 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
16706 EVT VT) const {
16707 // Just delegate to the generic legality, clear masks aren't special.
16708 return isShuffleMaskLegal(M, VT);
16709}
16710
16711/// getVShiftImm - Check if this is a valid build_vector for the immediate
16712/// operand of a vector shift operation, where all the elements of the
16713/// build_vector must have the same constant integer value.
16714static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
16715 // Ignore bit_converts.
16716 while (Op.getOpcode() == ISD::BITCAST)
16717 Op = Op.getOperand(0);
16718 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
16719 APInt SplatBits, SplatUndef;
16720 unsigned SplatBitSize;
16721 bool HasAnyUndefs;
16722 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
16723 HasAnyUndefs, ElementBits) ||
16724 SplatBitSize > ElementBits)
16725 return false;
16726 Cnt = SplatBits.getSExtValue();
16727 return true;
16728}
16729
16730/// isVShiftLImm - Check if this is a valid build_vector for the immediate
16731/// operand of a vector shift left operation. That value must be in the range:
16732/// 0 <= Value < ElementBits for a left shift; or
16733/// 0 <= Value <= ElementBits for a long left shift.
16734static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16735 assert(VT.isVector() && "vector shift count is not a vector type");
16736 int64_t ElementBits = VT.getScalarSizeInBits();
16737 if (!getVShiftImm(Op, ElementBits, Cnt))
16738 return false;
16739 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16740}
16741
16742/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16743/// operand of a vector shift right operation. The value must be in the range:
16744 /// 1 <= Value <= ElementBits for a right shift, or 1 <= Value <= ElementBits/2 for a narrow right shift.
16745static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16746 assert(VT.isVector() && "vector shift count is not a vector type");
16747 int64_t ElementBits = VT.getScalarSizeInBits();
16748 if (!getVShiftImm(Op, ElementBits, Cnt))
16749 return false;
16750 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16751}
16752
16753SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16754 SelectionDAG &DAG) const {
16755 EVT VT = Op.getValueType();
16756
16757 if (VT.getScalarType() == MVT::i1) {
16758 // Lower i1 truncate to `(x & 1) != 0`.
16759 SDLoc DL(Op);
16760 EVT OpVT = Op.getOperand(0).getValueType();
16761 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16762 SDValue One = DAG.getConstant(1, DL, OpVT);
16763 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16764 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16765 }
16766
16767 if (!VT.isVector() || VT.isScalableVector())
16768 return SDValue();
16769
16770 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16771 !Subtarget->isNeonAvailable()))
16772 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16773
16774 return SDValue();
16775}
16776
16777 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
16778 // possibly a truncated type; it tells how many bits of the value are to be
16779 // used.
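// For example (illustrative): with ResVT == nxv8i16, the pattern
//   (srl (add x, (splat 8)), (splat 4))
// adds 1 << (4 - 1) before shifting, so it can be selected as a rounding
// shift right (URSHR) by 4, provided the add cannot overflow into bits that
// the truncated result would keep.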
16780 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
16781 SelectionDAG &DAG,
16782 unsigned &ShiftValue,
16783 SDValue &RShOperand) {
16784 if (Shift->getOpcode() != ISD::SRL)
16785 return false;
16786
16787 EVT VT = Shift.getValueType();
16788 assert(VT.isScalableVT());
16789
16790 auto ShiftOp1 =
16791 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
16792 if (!ShiftOp1)
16793 return false;
16794
16795 ShiftValue = ShiftOp1->getZExtValue();
16796 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16797 return false;
16798
16799 SDValue Add = Shift->getOperand(0);
16800 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16801 return false;
16802
16804 "ResVT must be truncated or same type as the shift.");
16805 // Check if an overflow can lead to incorrect results.
16806 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16807 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16808 return false;
16809
16810 auto AddOp1 =
16811 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
16812 if (!AddOp1)
16813 return false;
16814 uint64_t AddValue = AddOp1->getZExtValue();
16815 if (AddValue != 1ULL << (ShiftValue - 1))
16816 return false;
16817
16818 RShOperand = Add->getOperand(0);
16819 return true;
16820}
16821
16822SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16823 SelectionDAG &DAG) const {
16824 EVT VT = Op.getValueType();
16825 SDLoc DL(Op);
16826 int64_t Cnt;
16827
16828 if (!Op.getOperand(1).getValueType().isVector())
16829 return Op;
16830 unsigned EltSize = VT.getScalarSizeInBits();
16831
16832 switch (Op.getOpcode()) {
16833 case ISD::SHL:
16834 if (VT.isScalableVector() ||
16835 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16836 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16837
16838 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16839 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16840 DAG.getTargetConstant(Cnt, DL, MVT::i32));
16841 return DAG.getNode(
16842 ISD::INTRINSIC_WO_CHAIN, DL, VT,
16843 DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
16844 Op.getOperand(0), Op.getOperand(1));
16845 case ISD::SRA:
16846 case ISD::SRL:
16847 if (VT.isScalableVector() &&
16848 (Subtarget->hasSVE2() ||
16849 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16850 SDValue RShOperand;
16851 unsigned ShiftValue;
16852 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16853 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16854 getPredicateForVector(DAG, DL, VT), RShOperand,
16855 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16856 }
16857
16858 if (VT.isScalableVector() ||
16859 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16860 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16861 : AArch64ISD::SRL_PRED;
16862 return LowerToPredicatedOp(Op, DAG, Opc);
16863 }
16864
16865 // Right shift immediate
16866 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16867 unsigned Opc =
16868 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16869 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16870 DAG.getTargetConstant(Cnt, DL, MVT::i32),
16871 Op->getFlags());
16872 }
16873
16874 // Right shift register. Note that there is no right-shift-by-register
16875 // instruction, but the shift-left-by-register instruction takes a signed
16876 // value, where negative amounts specify a right shift.
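// For example (illustrative): v4i32 (srl x, y) becomes
//   @llvm.aarch64.neon.ushl(x, sub(0, y))
// i.e. USHL with a negated, per-lane shift amount, and SRA uses SSHL in the
// same way.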
16877 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16878 : Intrinsic::aarch64_neon_ushl;
16879 // negate the shift amount
16880 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16881 Op.getOperand(1));
16882 SDValue NegShiftLeft =
16883 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16884 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16885 NegShift);
16886 return NegShiftLeft;
16887 }
16888
16889 llvm_unreachable("unexpected shift opcode");
16890}
16891
16892SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
16893 SelectionDAG &DAG) const {
16894 if (Op.getValueType().isScalableVector())
16895 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
16896
16897 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16898 !Subtarget->isNeonAvailable()))
16899 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
16900
16901 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16902 SDValue LHS = Op.getOperand(0);
16903 SDValue RHS = Op.getOperand(1);
16904 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
16905 SDLoc DL(Op);
16906
16907 if (LHS.getValueType().getVectorElementType().isInteger())
16908 return Op;
16909
16910 assert(((!Subtarget->hasFullFP16() &&
16911 LHS.getValueType().getVectorElementType() != MVT::f16) ||
16912 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
16913 LHS.getValueType().getVectorElementType() != MVT::f128) &&
16914 "Unexpected type!");
16915
16916 // Lower isnan(x) | isnan(never-nan) to x != x.
16917 // Lower !isnan(x) & !isnan(never-nan) to x == x.
16918 if (CC == ISD::SETUO || CC == ISD::SETO) {
16919 bool OneNaN = false;
16920 if (LHS == RHS) {
16921 OneNaN = true;
16922 } else if (DAG.isKnownNeverNaN(RHS)) {
16923 OneNaN = true;
16924 RHS = LHS;
16925 } else if (DAG.isKnownNeverNaN(LHS)) {
16926 OneNaN = true;
16927 LHS = RHS;
16928 }
16929 if (OneNaN) {
16930 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
16931 }
16932 }
16933
16934 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
16935 // clean. Some of them require two branches to implement.
16936 AArch64CC::CondCode CC1, CC2;
16937 bool ShouldInvert;
16938 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
16939
16940 bool NoNaNs =
16941 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
16942 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
16943 if (!Cmp.getNode())
16944 return SDValue();
16945
16946 if (CC2 != AArch64CC::AL) {
16947 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
16948 if (!Cmp2.getNode())
16949 return SDValue();
16950
16951 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
16952 }
16953
16954 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
16955
16956 if (ShouldInvert)
16957 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
16958
16959 return Cmp;
16960}
16961
16962static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
16963 SelectionDAG &DAG) {
16964 SDValue VecOp = ScalarOp.getOperand(0);
16965 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
16966 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
16967 DAG.getConstant(0, DL, MVT::i64));
16968}
16969
16970static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
16971 SDLoc DL, SelectionDAG &DAG) {
16972 unsigned ScalarOpcode;
16973 switch (Opcode) {
16974 case ISD::VECREDUCE_AND:
16975 ScalarOpcode = ISD::AND;
16976 break;
16977 case ISD::VECREDUCE_OR:
16978 ScalarOpcode = ISD::OR;
16979 break;
16980 case ISD::VECREDUCE_XOR:
16981 ScalarOpcode = ISD::XOR;
16982 break;
16983 default:
16984 llvm_unreachable("Expected bitwise vector reduction");
16985 return SDValue();
16986 }
16987
16988 EVT VecVT = Vec.getValueType();
16989 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
16990 "Expected power-of-2 length vector");
16991
16992 EVT ElemVT = VecVT.getVectorElementType();
16993
16994 SDValue Result;
16995 unsigned NumElems = VecVT.getVectorNumElements();
16996
16997 // Special case for boolean reductions
16998 if (ElemVT == MVT::i1) {
16999 // Split large vectors into smaller ones
17000 if (NumElems > 16) {
17001 SDValue Lo, Hi;
17002 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
17003 EVT HalfVT = Lo.getValueType();
17004 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
17005 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
17006 }
17007
17008 // Results of setcc operations get widened to 128 bits if their input
17009 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
17010 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
17011 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
17012 // size leads to the best codegen, since e.g. setcc results might need to be
17013 // truncated otherwise.
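// For example (illustrative): a v4i1 OR reduction whose input came from a
// setcc on v4i32 is sign-extended to v4i32 (all-ones / all-zeros lanes) and
// then reduced with VECREDUCE_UMAX (UMAXV), so any true lane yields a nonzero
// result that is truncated back to i1.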
17014 unsigned ExtendedWidth = 64;
17015 if (Vec.getOpcode() == ISD::SETCC &&
17016 Vec.getOperand(0).getValueSizeInBits() >= 128) {
17017 ExtendedWidth = 128;
17018 }
17019 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
17020
17021 // any_ext doesn't work with umin/umax, so only use it for uadd.
17022 unsigned ExtendOp =
17023 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
17024 SDValue Extended = DAG.getNode(
17025 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
17026 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
17027 // in that case we bitcast the sign extended values from v2i64 to v4i32
17028 // before reduction for optimal code generation.
17029 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
17030 NumElems == 2 && ExtendedWidth == 128) {
17031 Extended = DAG.getBitcast(MVT::v4i32, Extended);
17032 ExtendedVT = MVT::i32;
17033 }
17034 switch (ScalarOpcode) {
17035 case ISD::AND:
17036 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
17037 break;
17038 case ISD::OR:
17039 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
17040 break;
17041 case ISD::XOR:
17042 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
17043 break;
17044 default:
17045 llvm_unreachable("Unexpected Opcode");
17046 }
17047
17048 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
17049 } else {
17050 // Iteratively split the vector in half and combine using the bitwise
17051 // operation until it fits in a 64 bit register.
17052 while (VecVT.getSizeInBits() > 64) {
17053 SDValue Lo, Hi;
17054 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
17055 VecVT = Lo.getValueType();
17056 NumElems = VecVT.getVectorNumElements();
17057 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
17058 }
17059
17060 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
17061
17062 // Do the remaining work on a scalar since it allows the code generator to
17063 // combine the shift and bitwise operation into one instruction and since
17064 // integer instructions can have higher throughput than vector instructions.
17065 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
17066
17067 // Iteratively combine the lower and upper halves of the scalar using the
17068 // bitwise operation, halving the relevant region of the scalar in each
17069 // iteration, until the relevant region is just one element of the original
17070 // vector.
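// For example (illustrative): a v8i8 XOR reduction bitcasts to an i64 scalar
// and folds it as
//   x ^= x >> 32; x ^= x >> 16; x ^= x >> 8;
// after which the low 8 bits hold the reduction of all eight lanes.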
17071 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
17072 SDValue ShiftAmount =
17073 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
17074 SDValue Shifted =
17075 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
17076 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
17077 }
17078
17079 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
17080 }
17081
17082 return DAG.getAnyExtOrTrunc(Result, DL, VT);
17083}
17084
17085SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
17086 SelectionDAG &DAG) const {
17087 SDValue Src = Op.getOperand(0);
17088 EVT SrcVT = Src.getValueType();
17089
17090 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
17091 // widening by inserting zeroes.
17092 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
17093 SrcVT == MVT::v2f16) {
17094 SDLoc DL(Op);
17095 return DAG.getNode(ISD::FADD, DL, MVT::f16,
17096 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
17097 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
17098 }
17099
17100 // Try to lower fixed length reductions to SVE.
17101 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
17102 Op.getOpcode() == ISD::VECREDUCE_AND ||
17103 Op.getOpcode() == ISD::VECREDUCE_OR ||
17104 Op.getOpcode() == ISD::VECREDUCE_XOR ||
17105 Op.getOpcode() == ISD::VECREDUCE_FADD ||
17106 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
17107 SrcVT.getVectorElementType() == MVT::i64);
17108 if (SrcVT.isScalableVector() ||
17109 useSVEForFixedLengthVectorVT(
17110 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
17111
17112 if (SrcVT.getVectorElementType() == MVT::i1)
17113 return LowerPredReductionToSVE(Op, DAG);
17114
17115 switch (Op.getOpcode()) {
17116 case ISD::VECREDUCE_ADD:
17117 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
17118 case ISD::VECREDUCE_AND:
17119 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
17120 case ISD::VECREDUCE_OR:
17121 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
17122 case ISD::VECREDUCE_SMAX:
17123 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
17124 case ISD::VECREDUCE_SMIN:
17125 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
17126 case ISD::VECREDUCE_UMAX:
17127 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
17128 case ISD::VECREDUCE_UMIN:
17129 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
17130 case ISD::VECREDUCE_XOR:
17131 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
17132 case ISD::VECREDUCE_FADD:
17133 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
17134 case ISD::VECREDUCE_FMAX:
17135 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
17136 case ISD::VECREDUCE_FMIN:
17137 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
17138 case ISD::VECREDUCE_FMAXIMUM:
17139 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
17140 case ISD::VECREDUCE_FMINIMUM:
17141 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
17142 default:
17143 llvm_unreachable("Unhandled fixed length reduction");
17144 }
17145 }
17146
17147 // Lower NEON reductions.
17148 SDLoc DL(Op);
17149 switch (Op.getOpcode()) {
17150 case ISD::VECREDUCE_AND:
17151 case ISD::VECREDUCE_OR:
17152 case ISD::VECREDUCE_XOR:
17153 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
17154 Op.getValueType(), DL, DAG);
17155 case ISD::VECREDUCE_ADD:
17156 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
17157 case ISD::VECREDUCE_SMAX:
17158 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
17159 case ISD::VECREDUCE_SMIN:
17160 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
17161 case ISD::VECREDUCE_UMAX:
17162 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
17163 case ISD::VECREDUCE_UMIN:
17164 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
17165 default:
17166 llvm_unreachable("Unhandled reduction");
17167 }
17168}
17169
17170SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
17171 SelectionDAG &DAG) const {
17172 SDLoc DL(Op);
17173 SDValue Src = Op.getOperand(0);
17174 EVT SrcVT = Src.getValueType();
17175 assert(SrcVT.isScalableVector() && "Unexpected operand type!");
17176
17177 SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
17178 unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
17179 SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());
17180
17181 // Whilst we don't know the exact size of the vector, we do know its maximum
17182 // size, so we can perform a tree reduction with an identity vector, which
17183 // means that once we arrive at the result the remaining stages (when the
17184 // vector is smaller than the maximum) have no effect.
17185
17187 unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
17188
17189 for (unsigned I = 0; I < Stages; ++I) {
17190 Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
17191 Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
17192 }
17193
17194 return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
17195}
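// Illustrative sketch (assumption, plain C++ rather than SelectionDAG nodes):
// a pairwise tree reduction over a buffer padded to the maximum length with
// the multiplicative identity. The padded stages only ever multiply by 1, so
// they cannot change the result, which is why not knowing the exact runtime
// vector length is harmless. MaxN is assumed to be a power of two.
static inline int mulTreeReduceSketch(const int *Vals, unsigned N, int *Buf,
                                      unsigned MaxN) {
  for (unsigned I = 0; I < MaxN; ++I)
    Buf[I] = I < N ? Vals[I] : 1; // identity padding beyond the real length
  for (unsigned Width = MaxN; Width > 1; Width /= 2)
    for (unsigned I = 0; I < Width / 2; ++I)
      Buf[I] *= Buf[I + Width / 2];
  return Buf[0];
}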
17196
17197SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
17198 SelectionDAG &DAG) const {
17199 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17200 // No point replacing if we don't have the relevant instruction/libcall anyway
17201 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
17202 return SDValue();
17203
17204 // LSE has an atomic load-clear instruction, but not a load-and.
17205 SDLoc DL(Op);
17206 MVT VT = Op.getSimpleValueType();
17207 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
17208 SDValue RHS = Op.getOperand(2);
17209 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
17210 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
17211 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
17212 Op.getOperand(0), Op.getOperand(1), RHS,
17213 AN->getMemOperand());
17214}
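// Illustrative sketch (assumption, requires <atomic>): the rewrite above in
// plain C++. An atomic "fetch_and RHS" is the same read-modify-write as an
// atomic "clear ~RHS", which is the operation LDCLR provides, hence the XOR
// with an all-ones constant before emitting ATOMIC_LOAD_CLR.
static inline unsigned atomicFetchClearSketch(std::atomic<unsigned> &A,
                                              unsigned ClearMask) {
  unsigned Old = A.load();
  while (!A.compare_exchange_weak(Old, Old & ~ClearMask)) {
  }
  return Old; // A.fetch_and(M) is equivalent to atomicFetchClearSketch(A, ~M)
}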
17215
17216SDValue
17217AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
17218 SelectionDAG &DAG) const {
17219
17220 SDLoc DL(Op);
17221 // Get the inputs.
17222 SDNode *Node = Op.getNode();
17223 SDValue Chain = Op.getOperand(0);
17224 SDValue Size = Op.getOperand(1);
17225 MaybeAlign Align =
17226 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
17227 EVT VT = Node->getValueType(0);
17228
17230 "no-stack-arg-probe")) {
17231 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17232 Chain = SP.getValue(1);
17233 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17234 if (Align)
17235 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17236 DAG.getSignedConstant(-Align->value(), DL, VT));
17237 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
17238 SDValue Ops[2] = {SP, Chain};
17239 return DAG.getMergeValues(Ops, DL);
17240 }
17241
17242 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
17243
17244 EVT PtrVT = getPointerTy(DAG.getDataLayout());
17245 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
17246 PtrVT, 0);
17247
17248 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
17249 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
17250 if (Subtarget->hasCustomCallingConv())
17251 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
17252
17253 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
17254 DAG.getConstant(4, DL, MVT::i64));
17255 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
17256 Chain =
17257 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
17258 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
17259 DAG.getRegisterMask(Mask), Chain.getValue(1));
17260 // To match the actual intent better, we should read the output from X15 here
17261 // again (instead of potentially spilling it to the stack), but rereading Size
17262 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
17263 // here.
17264
17265 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
17266 DAG.getConstant(4, DL, MVT::i64));
17267
17268 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17269 Chain = SP.getValue(1);
17270 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17271 if (Align)
17272 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17273 DAG.getSignedConstant(-Align->value(), DL, VT));
17274 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
17275
17276 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
17277
17278 SDValue Ops[2] = {SP, Chain};
17279 return DAG.getMergeValues(Ops, DL);
17280}
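// Illustrative sketch (assumption): the Windows stack probe expects the
// allocation size in 16-byte units in X15, which is why the size is shifted
// right by 4 before the call and scaled back up by 4 afterwards.
static inline uint64_t chkStkUnitsSketch(uint64_t SizeInBytes) {
  return SizeInBytes >> 4; // value passed in X15; later rescaled by "<< 4"
}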
17281
17282SDValue
17283AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
17284 SelectionDAG &DAG) const {
17285 // Get the inputs.
17286 SDNode *Node = Op.getNode();
17287 SDValue Chain = Op.getOperand(0);
17288 SDValue Size = Op.getOperand(1);
17289
17290 MaybeAlign Align =
17291 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
17292 SDLoc DL(Op);
17293 EVT VT = Node->getValueType(0);
17294
17295 // Construct the new SP value in a GPR.
17296 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17297 Chain = SP.getValue(1);
17298 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17299 if (Align)
17300 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17301 DAG.getSignedConstant(-Align->value(), DL, VT));
17302
17303 // Set the real SP to the new value with a probing loop.
17304 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
17305 SDValue Ops[2] = {SP, Chain};
17306 return DAG.getMergeValues(Ops, DL);
17307}
17308
17309SDValue
17310AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
17311 SelectionDAG &DAG) const {
17312 MachineFunction &MF = DAG.getMachineFunction();
17313
17314 if (Subtarget->isTargetWindows())
17315 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
17316 else if (hasInlineStackProbe(MF))
17317 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
17318 else
17319 return SDValue();
17320}
17321
17322SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
17323 unsigned NewOp) const {
17324 if (Subtarget->hasSVE2())
17325 return LowerToPredicatedOp(Op, DAG, NewOp);
17326
17327 // Default to expand.
17328 return SDValue();
17329}
17330
17331SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
17332 SelectionDAG &DAG) const {
17333 EVT VT = Op.getValueType();
17334 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
17335
17336 SDLoc DL(Op);
17337 APInt MulImm = Op.getConstantOperandAPInt(0);
17338 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
17339 VT);
17340}
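// Illustrative sketch (assumption): the value a VSCALE node represents is just
// the sign-extended immediate multiplied by the runtime vscale factor; the
// lowering computes it in 64 bits and then narrows to the requested type.
static inline uint64_t vscaleValueSketch(int64_t MulImm, uint64_t VScale) {
  return static_cast<uint64_t>(MulImm) * VScale; // truncated by the caller
}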
17341
17342/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
17343template <unsigned NumVecs>
17344static bool
17348 // Retrieve EC from first vector argument.
17349 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
17351#ifndef NDEBUG
17352 // Check the assumption that all input vectors are the same type.
17353 for (unsigned I = 0; I < NumVecs; ++I)
17354 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
17355 "Invalid type.");
17356#endif
17357 // memVT is `NumVecs * VT`.
17359 EC * NumVecs);
17360 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
17361 Info.offset = 0;
17362 Info.align.reset();
17364 return true;
17365}
17366
17367/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
17368/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
17369/// specified in the intrinsic calls.
17371 const CallBase &I,
17372 MachineFunction &MF,
17373 unsigned Intrinsic) const {
17374 auto &DL = I.getDataLayout();
17375 switch (Intrinsic) {
17376 case Intrinsic::aarch64_sve_st2:
17377 return setInfoSVEStN<2>(*this, DL, Info, I);
17378 case Intrinsic::aarch64_sve_st3:
17379 return setInfoSVEStN<3>(*this, DL, Info, I);
17380 case Intrinsic::aarch64_sve_st4:
17381 return setInfoSVEStN<4>(*this, DL, Info, I);
17382 case Intrinsic::aarch64_neon_ld2:
17383 case Intrinsic::aarch64_neon_ld3:
17384 case Intrinsic::aarch64_neon_ld4:
17385 case Intrinsic::aarch64_neon_ld1x2:
17386 case Intrinsic::aarch64_neon_ld1x3:
17387 case Intrinsic::aarch64_neon_ld1x4: {
17388 Info.opc = ISD::INTRINSIC_W_CHAIN;
17389 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
17390 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17391 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17392 Info.offset = 0;
17393 Info.align.reset();
17394 // volatile loads with NEON intrinsics not supported
17395 Info.flags = MachineMemOperand::MOLoad;
17396 return true;
17397 }
17398 case Intrinsic::aarch64_neon_ld2lane:
17399 case Intrinsic::aarch64_neon_ld3lane:
17400 case Intrinsic::aarch64_neon_ld4lane:
17401 case Intrinsic::aarch64_neon_ld2r:
17402 case Intrinsic::aarch64_neon_ld3r:
17403 case Intrinsic::aarch64_neon_ld4r: {
17404 Info.opc = ISD::INTRINSIC_W_CHAIN;
17405 // These ldN lane/replicate intrinsics return a struct whose members all have
17405 // the same vector type.
17406 Type *RetTy = I.getType();
17407 auto *StructTy = cast<StructType>(RetTy);
17408 unsigned NumElts = StructTy->getNumElements();
17409 Type *VecTy = StructTy->getElementType(0);
17410 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17411 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17412 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17413 Info.offset = 0;
17414 Info.align.reset();
17415 // volatile loads with NEON intrinsics not supported
17416 Info.flags = MachineMemOperand::MOLoad;
17417 return true;
17418 }
17419 case Intrinsic::aarch64_neon_st2:
17420 case Intrinsic::aarch64_neon_st3:
17421 case Intrinsic::aarch64_neon_st4:
17422 case Intrinsic::aarch64_neon_st1x2:
17423 case Intrinsic::aarch64_neon_st1x3:
17424 case Intrinsic::aarch64_neon_st1x4: {
17425 Info.opc = ISD::INTRINSIC_VOID;
17426 unsigned NumElts = 0;
17427 for (const Value *Arg : I.args()) {
17428 Type *ArgTy = Arg->getType();
17429 if (!ArgTy->isVectorTy())
17430 break;
17431 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
17432 }
17433 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17434 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17435 Info.offset = 0;
17436 Info.align.reset();
17437 // volatile stores with NEON intrinsics not supported
17438 Info.flags = MachineMemOperand::MOStore;
17439 return true;
17440 }
17441 case Intrinsic::aarch64_neon_st2lane:
17442 case Intrinsic::aarch64_neon_st3lane:
17443 case Intrinsic::aarch64_neon_st4lane: {
17444 Info.opc = ISD::INTRINSIC_VOID;
17445 unsigned NumElts = 0;
17446 // All of the vector arguments have the same type.
17447 Type *VecTy = I.getArgOperand(0)->getType();
17448 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17449
17450 for (const Value *Arg : I.args()) {
17451 Type *ArgTy = Arg->getType();
17452 if (!ArgTy->isVectorTy())
17453 break;
17454 NumElts += 1;
17455 }
17456
17457 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17458 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17459 Info.offset = 0;
17460 Info.align.reset();
17461 // volatile stores with NEON intrinsics not supported
17462 Info.flags = MachineMemOperand::MOStore;
17463 return true;
17464 }
17465 case Intrinsic::aarch64_ldaxr:
17466 case Intrinsic::aarch64_ldxr: {
17467 Type *ValTy = I.getParamElementType(0);
17468 Info.opc = ISD::INTRINSIC_W_CHAIN;
17469 Info.memVT = MVT::getVT(ValTy);
17470 Info.ptrVal = I.getArgOperand(0);
17471 Info.offset = 0;
17472 Info.align = DL.getABITypeAlign(ValTy);
17474 return true;
17475 }
17476 case Intrinsic::aarch64_stlxr:
17477 case Intrinsic::aarch64_stxr: {
17478 Type *ValTy = I.getParamElementType(1);
17479 Info.opc = ISD::INTRINSIC_W_CHAIN;
17480 Info.memVT = MVT::getVT(ValTy);
17481 Info.ptrVal = I.getArgOperand(1);
17482 Info.offset = 0;
17483 Info.align = DL.getABITypeAlign(ValTy);
17485 return true;
17486 }
17487 case Intrinsic::aarch64_ldaxp:
17488 case Intrinsic::aarch64_ldxp:
17489 Info.opc = ISD::INTRINSIC_W_CHAIN;
17490 Info.memVT = MVT::i128;
17491 Info.ptrVal = I.getArgOperand(0);
17492 Info.offset = 0;
17493 Info.align = Align(16);
17495 return true;
17496 case Intrinsic::aarch64_stlxp:
17497 case Intrinsic::aarch64_stxp:
17498 Info.opc = ISD::INTRINSIC_W_CHAIN;
17499 Info.memVT = MVT::i128;
17500 Info.ptrVal = I.getArgOperand(2);
17501 Info.offset = 0;
17502 Info.align = Align(16);
17504 return true;
17505 case Intrinsic::aarch64_sve_ldnt1: {
17506 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
17507 Info.opc = ISD::INTRINSIC_W_CHAIN;
17508 Info.memVT = MVT::getVT(I.getType());
17509 Info.ptrVal = I.getArgOperand(1);
17510 Info.offset = 0;
17511 Info.align = DL.getABITypeAlign(ElTy);
17513 return true;
17514 }
17515 case Intrinsic::aarch64_sve_stnt1: {
17516 Type *ElTy =
17517 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
17518 Info.opc = ISD::INTRINSIC_W_CHAIN;
17519 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
17520 Info.ptrVal = I.getArgOperand(2);
17521 Info.offset = 0;
17522 Info.align = DL.getABITypeAlign(ElTy);
17524 return true;
17525 }
17526 case Intrinsic::aarch64_mops_memset_tag: {
17527 Value *Dst = I.getArgOperand(0);
17528 Value *Val = I.getArgOperand(1);
17529 Info.opc = ISD::INTRINSIC_W_CHAIN;
17530 Info.memVT = MVT::getVT(Val->getType());
17531 Info.ptrVal = Dst;
17532 Info.offset = 0;
17533 Info.align = I.getParamAlign(0).valueOrOne();
17534 Info.flags = MachineMemOperand::MOStore;
17535 // The size of the memory being operated on is unknown at this point
17536 Info.size = MemoryLocation::UnknownSize;
17537 return true;
17538 }
17539 default:
17540 break;
17541 }
17542
17543 return false;
17544}
17545
17547 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
17548 std::optional<unsigned> ByteOffset) const {
17549 // TODO: This may be worth removing. Check regression tests for diffs.
17550 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
17551 ByteOffset))
17552 return false;
17553
17554 // If we're reducing the load width in order to avoid having to use an extra
17555 // instruction to do the extension, then it's probably a good idea.
17556 if (ExtTy != ISD::NON_EXTLOAD)
17557 return true;
17558 // Don't reduce load width if it would prevent us from combining a shift into
17559 // the offset.
17560 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
17561 assert(Mem);
17562 const SDValue &Base = Mem->getBasePtr();
17563 if (Base.getOpcode() == ISD::ADD &&
17564 Base.getOperand(1).getOpcode() == ISD::SHL &&
17565 Base.getOperand(1).hasOneUse() &&
17566 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
17567 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
17568 if (Mem->getMemoryVT().isScalableVector())
17569 return false;
17570 // The shift can be combined if it matches the size of the value being
17571 // loaded (and so reducing the width would make it not match).
17572 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
17573 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
17574 if (ShiftAmount == Log2_32(LoadBytes))
17575 return false;
17576 }
17577 // We have no reason to disallow reducing the load width, so allow it.
17578 return true;
17579}
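// Illustrative sketch (assumption): the addressing pattern being protected. If
// the shift amount equals log2 of the access size, the shift folds into the
// scaled register offset of the load (e.g. "ldr x0, [x1, x2, lsl #3]"), and
// narrowing the load would break that fold.
static inline uint64_t scaledOffsetLoadSketch(const uint64_t *Base,
                                              uint64_t Idx) {
  return Base[Idx]; // typically a single ldr with an "lsl #3" scaled offset
}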
17580
17581// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
17583 EVT VT = Extend.getValueType();
17584 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
17585 SDValue Extract = Extend.getOperand(0);
17586 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
17587 Extract = Extract.getOperand(0);
17588 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
17589 EVT VecVT = Extract.getOperand(0).getValueType();
17590 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
17591 return false;
17592 }
17593 }
17594 return true;
17595}
17596
17597 // Truncations from a 64-bit GPR to a 32-bit GPR are free.
17599 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17600 return false;
17601 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
17602 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
17603 return NumBits1 > NumBits2;
17604}
17606 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17607 return false;
17608 uint64_t NumBits1 = VT1.getFixedSizeInBits();
17609 uint64_t NumBits2 = VT2.getFixedSizeInBits();
17610 return NumBits1 > NumBits2;
17611}
17612
17613 /// Check if it is profitable to hoist an instruction from a then/else block
17614 /// into the if block. Not profitable if I and its user can form an FMA
17615 /// instruction, because we prefer FMSUB/FMADD.
17617 if (I->getOpcode() != Instruction::FMul)
17618 return true;
17619
17620 if (!I->hasOneUse())
17621 return true;
17622
17623 Instruction *User = I->user_back();
17624
17625 if (!(User->getOpcode() == Instruction::FSub ||
17626 User->getOpcode() == Instruction::FAdd))
17627 return true;
17628
17630 const Function *F = I->getFunction();
17631 const DataLayout &DL = F->getDataLayout();
17632 Type *Ty = User->getOperand(0)->getType();
17633
17634 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17636 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17637 I->getFastMathFlags().allowContract()));
17638}
17639
17640// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17641// 64-bit GPR.
17643 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17644 return false;
17645 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17646 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17647 return NumBits1 == 32 && NumBits2 == 64;
17648}
17650 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17651 return false;
17652 unsigned NumBits1 = VT1.getSizeInBits();
17653 unsigned NumBits2 = VT2.getSizeInBits();
17654 return NumBits1 == 32 && NumBits2 == 64;
17655}
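// Illustrative sketch (assumption): a 32-to-64-bit zero extension costs
// nothing because every write to a W register already clears bits [63:32] of
// the corresponding X register.
static inline uint64_t zextFreeSketch(uint32_t W) {
  return W; // no explicit uxtw/and is needed; the high half is already zero
}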
17656
17658 EVT VT1 = Val.getValueType();
17659 if (isZExtFree(VT1, VT2)) {
17660 return true;
17661 }
17662
17663 if (Val.getOpcode() != ISD::LOAD)
17664 return false;
17665
17666 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17667 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17668 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17669 VT1.getSizeInBits() <= 32);
17670}
17671
17672bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17673 if (isa<FPExtInst>(Ext))
17674 return false;
17675
17676 // Vector types are not free.
17677 if (Ext->getType()->isVectorTy())
17678 return false;
17679
17680 for (const Use &U : Ext->uses()) {
17681 // The extension is free if we can fold it with a left shift in an
17682 // addressing mode or an arithmetic operation: add, sub, and cmp.
17683
17684 // Is there a shift?
17685 const Instruction *Instr = cast<Instruction>(U.getUser());
17686
17687 // Is this a constant shift?
17688 switch (Instr->getOpcode()) {
17689 case Instruction::Shl:
17690 if (!isa<ConstantInt>(Instr->getOperand(1)))
17691 return false;
17692 break;
17693 case Instruction::GetElementPtr: {
17694 gep_type_iterator GTI = gep_type_begin(Instr);
17695 auto &DL = Ext->getDataLayout();
17696 std::advance(GTI, U.getOperandNo()-1);
17697 Type *IdxTy = GTI.getIndexedType();
17698 // This extension will end up with a shift because of the scaling factor.
17699 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17700 // Get the shift amount based on the scaling factor:
17701 // log2(sizeof(IdxTy)) - log2(8).
17702 if (IdxTy->isScalableTy())
17703 return false;
17704 uint64_t ShiftAmt =
17705 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
17706 3;
17707 // Is the constant foldable in the shift of the addressing mode?
17708 // I.e., shift amount is between 1 and 4 inclusive.
17709 if (ShiftAmt == 0 || ShiftAmt > 4)
17710 return false;
17711 break;
17712 }
17713 case Instruction::Trunc:
17714 // Check if this is a noop.
17715 // trunc(sext ty1 to ty2) to ty1.
17716 if (Instr->getType() == Ext->getOperand(0)->getType())
17717 continue;
17718 [[fallthrough]];
17719 default:
17720 return false;
17721 }
17722
17723 // At this point we can use the bfm family, so this extension is free
17724 // for that use.
17725 }
17726 return true;
17727}
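// Illustrative sketch (assumption): an extension feeding a GEP over 8-byte
// elements implies a shift of log2(8) = 3, which is within the 1..4 range
// accepted above, so the extend folds into the addressing mode.
static inline int64_t extIntoAddrModeSketch(const int64_t *A, uint32_t W) {
  return A[W]; // typically "ldr x0, [x1, w2, uxtw #3]"; the zext is free
}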
17728
17729static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
17730 unsigned NumElts, bool IsLittleEndian,
17731 SmallVectorImpl<int> &Mask) {
17732 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
17733 return false;
17734
17735 assert(DstWidth % SrcWidth == 0 &&
17736 "TBL lowering is not supported for a conversion instruction with this "
17737 "source and destination element type.");
17738
17739 unsigned Factor = DstWidth / SrcWidth;
17740 unsigned MaskLen = NumElts * Factor;
17741
17742 Mask.clear();
17743 Mask.resize(MaskLen, NumElts);
17744
17745 unsigned SrcIndex = 0;
17746 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
17747 Mask[I] = SrcIndex++;
17748
17749 return true;
17750}
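// Worked example (assumption): zero-extending 8 x i8 to 8 x i32 on a
// little-endian target gives Factor = 32 / 8 = 4 and MaskLen = 32. Each result
// lane takes one source byte and fills the remaining three bytes from the
// appended zero vector, whose first element is at shuffle index NumElts == 8:
static const int ZExtTblMaskSketch[32] = {
    0, 8, 8, 8, 1, 8, 8, 8, 2, 8, 8, 8, 3, 8, 8, 8,
    4, 8, 8, 8, 5, 8, 8, 8, 6, 8, 8, 8, 7, 8, 8, 8};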
17751
17753 FixedVectorType *ZExtTy,
17754 FixedVectorType *DstTy,
17755 bool IsLittleEndian) {
17756 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17757 unsigned NumElts = SrcTy->getNumElements();
17758 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17759 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17760
17761 SmallVector<int> Mask;
17762 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
17763 return nullptr;
17764
17765 auto *FirstEltZero = Builder.CreateInsertElement(
17766 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17767 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17768 Result = Builder.CreateBitCast(Result, DstTy);
17769 if (DstTy != ZExtTy)
17770 Result = Builder.CreateZExt(Result, ZExtTy);
17771 return Result;
17772}
17773
17775 FixedVectorType *DstTy,
17776 bool IsLittleEndian) {
17777 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17778 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17779 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17780
17781 SmallVector<int> Mask;
17782 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
17783 !IsLittleEndian, Mask))
17784 return nullptr;
17785
17786 auto *FirstEltZero = Builder.CreateInsertElement(
17787 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17788
17789 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17790}
17791
17792static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
17793 IRBuilder<> Builder(TI);
17795 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
17796 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
17797 auto *DstTy = cast<FixedVectorType>(TI->getType());
17798 assert(SrcTy->getElementType()->isIntegerTy() &&
17799 "Non-integer type source vector element is not supported");
17800 assert(DstTy->getElementType()->isIntegerTy(8) &&
17801 "Unsupported destination vector element type");
17802 unsigned SrcElemTySz =
17803 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17804 unsigned DstElemTySz =
17805 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17806 assert((SrcElemTySz % DstElemTySz == 0) &&
17807 "Cannot lower truncate to tbl instructions for a source element size "
17808 "that is not divisible by the destination element size");
17809 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
17810 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
17811 "Unsupported source vector element type size");
17812 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
17813
17814 // Create a mask that chooses every nth byte from the source vector table of
17815 // bytes to build the truncated destination vector, where 'n' is the truncate
17816 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose bytes
17817 // 0, 8, 16, ..., (Y-1)*8 for the little-endian format.
17819 for (int Itr = 0; Itr < 16; Itr++) {
17820 if (Itr < NumElements)
17821 MaskConst.push_back(Builder.getInt8(
17822 IsLittleEndian ? Itr * TruncFactor
17823 : Itr * TruncFactor + (TruncFactor - 1)));
17824 else
17825 MaskConst.push_back(Builder.getInt8(255));
17826 }
17827
17828 int MaxTblSz = 128 * 4;
17829 int MaxSrcSz = SrcElemTySz * NumElements;
17830 int ElemsPerTbl =
17831 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
17832 assert(ElemsPerTbl <= 16 &&
17833 "Maximum elements selected using TBL instruction cannot exceed 16!");
17834
17835 int ShuffleCount = 128 / SrcElemTySz;
17836 SmallVector<int> ShuffleLanes;
17837 for (int i = 0; i < ShuffleCount; ++i)
17838 ShuffleLanes.push_back(i);
17839
17840 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
17841 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
17842 // call TBL & save the result in a vector of TBL results for combining later.
17844 while (ShuffleLanes.back() < NumElements) {
17845 Parts.push_back(Builder.CreateBitCast(
17846 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
17847
17848 if (Parts.size() == 4) {
17849 Parts.push_back(ConstantVector::get(MaskConst));
17850 Results.push_back(
17851 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
17852 Parts.clear();
17853 }
17854
17855 for (int i = 0; i < ShuffleCount; ++i)
17856 ShuffleLanes[i] += ShuffleCount;
17857 }
17858
17859 assert((Parts.empty() || Results.empty()) &&
17860 "Lowering trunc for vectors requiring different TBL instructions is "
17861 "not supported!");
17862 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
17863 // registers
17864 if (!Parts.empty()) {
17865 Intrinsic::ID TblID;
17866 switch (Parts.size()) {
17867 case 1:
17868 TblID = Intrinsic::aarch64_neon_tbl1;
17869 break;
17870 case 2:
17871 TblID = Intrinsic::aarch64_neon_tbl2;
17872 break;
17873 case 3:
17874 TblID = Intrinsic::aarch64_neon_tbl3;
17875 break;
17876 }
17877
17878 Parts.push_back(ConstantVector::get(MaskConst));
17879 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
17880 }
17881
17882 // Extract the destination vector from TBL result(s) after combining them
17883 // where applicable. Currently, at most two TBLs are supported.
17884 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
17885 "more than 2 tbl instructions!");
17886 Value *FinalResult = Results[0];
17887 if (Results.size() == 1) {
17888 if (ElemsPerTbl < 16) {
17889 SmallVector<int> FinalMask(ElemsPerTbl);
17890 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17891 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
17892 }
17893 } else {
17894 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
17895 if (ElemsPerTbl < 16) {
17896 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
17897 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
17898 } else {
17899 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17900 }
17901 FinalResult =
17902 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
17903 }
17904
17905 TI->replaceAllUsesWith(FinalResult);
17906 TI->eraseFromParent();
17907}
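// Worked example (assumption): truncating 8 x i32 to 8 x i8 on a little-endian
// target gives TruncFactor = 4, so the TBL byte mask selects byte 0 of each
// 32-bit source lane and pads the unused result bytes with 255, which TBL
// treats as out of range and zeroes:
static const uint8_t TruncTblMaskSketch[16] = {
    0, 4, 8, 12, 16, 20, 24, 28, 255, 255, 255, 255, 255, 255, 255, 255};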
17908
17910 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
17911 // shuffle_vector instructions are serialized when targeting SVE,
17912 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
17913 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
17914 return false;
17915
17916 // Try to optimize conversions using tbl. This requires materializing constant
17917 // index vectors, which can increase code size and add loads. Skip the
17918 // transform unless the conversion is in a loop block guaranteed to execute
17919 // and we are not optimizing for size.
17920 Function *F = I->getParent()->getParent();
17921 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
17922 return false;
17923
17924 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
17925 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
17926 if (!SrcTy || !DstTy)
17927 return false;
17928
17929 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
17930 // lowered to tbl instructions to insert the original i8 elements
17931 // into i8x lanes. This is enabled for cases where it is beneficial.
17932 auto *ZExt = dyn_cast<ZExtInst>(I);
17933 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
17934 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
17935 if (DstWidth % 8 != 0)
17936 return false;
17937
17938 auto *TruncDstType =
17940 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
17941 // the remaining ZExt folded into the user, don't use tbl lowering.
17942 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
17943 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
17946 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
17947 return false;
17948
17949 DstTy = TruncDstType;
17950 }
17951
17952 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
17953 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
17954 // most one extra extend step is needed and using tbl is not profitable.
17955 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
17956 // udot instruction.
17957 if (SrcWidth * 4 <= DstWidth) {
17958 if (all_of(I->users(), [&](auto *U) {
17959 using namespace llvm::PatternMatch;
17960 auto *SingleUser = cast<Instruction>(&*U);
17961 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
17962 return true;
17963 if (match(SingleUser,
17964 m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
17965 m_Value(), m_Specific(I))))
17966 return true;
17967 return false;
17968 }))
17969 return false;
17970 }
17971
17972 if (DstTy->getScalarSizeInBits() >= 64)
17973 return false;
17974
17975 IRBuilder<> Builder(ZExt);
17977 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
17978 DstTy, Subtarget->isLittleEndian());
17979 if (!Result)
17980 return false;
17981 ZExt->replaceAllUsesWith(Result);
17982 ZExt->eraseFromParent();
17983 return true;
17984 }
17985
17986 auto *UIToFP = dyn_cast<UIToFPInst>(I);
17987 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
17988 DstTy->getElementType()->isFloatTy()) ||
17989 (SrcTy->getElementType()->isIntegerTy(16) &&
17990 DstTy->getElementType()->isDoubleTy()))) {
17991 IRBuilder<> Builder(I);
17993 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
17994 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
17995 assert(ZExt && "Cannot fail for the i8 to float conversion");
17996 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
17997 I->replaceAllUsesWith(UI);
17998 I->eraseFromParent();
17999 return true;
18000 }
18001
18002 auto *SIToFP = dyn_cast<SIToFPInst>(I);
18003 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
18004 DstTy->getElementType()->isFloatTy()) {
18005 IRBuilder<> Builder(I);
18006 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
18008 Subtarget->isLittleEndian());
18009 assert(Shuffle && "Cannot fail for the i8 to float conversion");
18010 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
18011 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
18012 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
18013 I->replaceAllUsesWith(SI);
18014 I->eraseFromParent();
18015 return true;
18016 }
18017
18018 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
18019 // followed by a truncate lowered to using tbl.4.
18020 auto *FPToUI = dyn_cast<FPToUIInst>(I);
18021 if (FPToUI &&
18022 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
18023 SrcTy->getElementType()->isFloatTy() &&
18024 DstTy->getElementType()->isIntegerTy(8)) {
18025 IRBuilder<> Builder(I);
18026 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
18027 VectorType::getInteger(SrcTy));
18028 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
18029 I->replaceAllUsesWith(TruncI);
18030 I->eraseFromParent();
18031 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
18032 return true;
18033 }
18034
18035 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
18036 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
18037 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
18038 // registers
18039 auto *TI = dyn_cast<TruncInst>(I);
18040 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
18041 ((SrcTy->getElementType()->isIntegerTy(32) ||
18042 SrcTy->getElementType()->isIntegerTy(64)) &&
18043 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
18044 createTblForTrunc(TI, Subtarget->isLittleEndian());
18045 return true;
18046 }
18047
18048 return false;
18049}
18050
18052 Align &RequiredAlignment) const {
18053 if (!LoadedType.isSimple() ||
18054 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
18055 return false;
18056 // Cyclone supports unaligned accesses.
18057 RequiredAlignment = Align(1);
18058 unsigned NumBits = LoadedType.getSizeInBits();
18059 return NumBits == 32 || NumBits == 64;
18060}
18061
18062/// A helper function for determining the number of interleaved accesses we
18063/// will generate when lowering accesses of the given type.
18065 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
18066 unsigned VecSize = 128;
18067 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18068 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
18069 if (UseScalable && isa<FixedVectorType>(VecTy))
18070 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
18071 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
18072}
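// Worked example (assumption): a fixed-length 16 x i32 group is 512 bits, so
// with the default 128-bit NEON vector size this yields
// (16 * 32 + 127) / 128 = 4 interleaved accesses.
static constexpr unsigned NumAccessesFor16xI32Sketch = (16 * 32 + 127) / 128;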
18073
18076 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
18077 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
18078 return MOStridedAccess;
18080}
18081
18083 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
18084 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18085 auto EC = VecTy->getElementCount();
18086 unsigned MinElts = EC.getKnownMinValue();
18087
18088 UseScalable = false;
18089
18090 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
18091 (!Subtarget->useSVEForFixedLengthVectors() ||
18093 return false;
18094
18095 if (isa<ScalableVectorType>(VecTy) &&
18096 !Subtarget->isSVEorStreamingSVEAvailable())
18097 return false;
18098
18099 // Ensure the number of vector elements is greater than 1.
18100 if (MinElts < 2)
18101 return false;
18102
18103 // Ensure the element type is legal.
18104 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
18105 return false;
18106
18107 if (EC.isScalable()) {
18108 UseScalable = true;
18109 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
18110 }
18111
18112 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
18113 if (Subtarget->useSVEForFixedLengthVectors()) {
18114 unsigned MinSVEVectorSize =
18115 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
18116 if (VecSize % MinSVEVectorSize == 0 ||
18117 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
18118 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
18119 UseScalable = true;
18120 return true;
18121 }
18122 }
18123
18124 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
18125 // 128 will be split into multiple interleaved accesses.
18126 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
18127}
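// Worked examples (assumption, NEON-only configuration): 2 x i32 (64 bits) and
// 4 x i32 (128 bits) are legal interleaved access types, 8 x i32 (256 bits) is
// legal but split into two accesses, and 3 x i32 (96 bits) is rejected because
// it is neither 64 bits nor a multiple of 128 bits.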
18128
18130 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
18131 return ScalableVectorType::get(VTy->getElementType(), 2);
18132
18133 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
18134 return ScalableVectorType::get(VTy->getElementType(), 4);
18135
18136 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
18137 return ScalableVectorType::get(VTy->getElementType(), 8);
18138
18139 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
18140 return ScalableVectorType::get(VTy->getElementType(), 8);
18141
18142 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
18143 return ScalableVectorType::get(VTy->getElementType(), 2);
18144
18145 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
18146 return ScalableVectorType::get(VTy->getElementType(), 4);
18147
18148 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
18149 return ScalableVectorType::get(VTy->getElementType(), 8);
18150
18151 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
18152 return ScalableVectorType::get(VTy->getElementType(), 16);
18153
18154 llvm_unreachable("Cannot handle input vector type");
18155}
18156
18157static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
18158 bool Scalable, Type *LDVTy,
18159 Type *PtrTy) {
18160 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18161 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
18162 Intrinsic::aarch64_sve_ld3_sret,
18163 Intrinsic::aarch64_sve_ld4_sret};
18164 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
18165 Intrinsic::aarch64_neon_ld3,
18166 Intrinsic::aarch64_neon_ld4};
18167 if (Scalable)
18168 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
18169
18170 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
18171 {LDVTy, PtrTy});
18172}
18173
18174static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
18175 bool Scalable, Type *STVTy,
18176 Type *PtrTy) {
18177 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18178 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
18179 Intrinsic::aarch64_sve_st3,
18180 Intrinsic::aarch64_sve_st4};
18181 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
18182 Intrinsic::aarch64_neon_st3,
18183 Intrinsic::aarch64_neon_st4};
18184 if (Scalable)
18185 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
18186
18187 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
18188 {STVTy, PtrTy});
18189}
18190
18191/// Lower an interleaved load into a ldN intrinsic.
18192///
18193/// E.g. Lower an interleaved load (Factor = 2):
18194/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
18195/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
18196/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
18197///
18198/// Into:
18199/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
18200 ///       %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
18201 ///       %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1
18203 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
18204 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
18205 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18206 "Invalid interleave factor");
18207 assert(!Shuffles.empty() && "Empty shufflevector input");
18208 assert(Shuffles.size() == Indices.size() &&
18209 "Unmatched number of shufflevectors and indices");
18210
18211 auto *LI = dyn_cast<LoadInst>(Load);
18212 if (!LI)
18213 return false;
18214 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
18215
18216 const DataLayout &DL = LI->getDataLayout();
18217
18218 VectorType *VTy = Shuffles[0]->getType();
18219
18220 // Skip if we do not have NEON and skip illegal vector types. We can
18221 // "legalize" wide vector types into multiple interleaved accesses as long as
18222 // the vector types are divisible by 128.
18223 bool UseScalable;
18224 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18225 return false;
18226
18227 // Check if the interleave is a zext(shuffle), that can be better optimized
18228 // into shift / and masks. For the moment we do this just for uitofp (not
18229 // zext) to avoid issues with widening instructions.
18230 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
18231 using namespace llvm::PatternMatch;
18232 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
18233 SI->getType()->getScalarSizeInBits() * 4 ==
18234 SI->user_back()->getType()->getScalarSizeInBits();
18235 }))
18236 return false;
18237
18238 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18239
18240 auto *FVTy = cast<FixedVectorType>(VTy);
18241
18242 // A pointer vector can not be the return type of the ldN intrinsics. Need to
18243 // load integer vectors first and then convert to pointer vectors.
18244 Type *EltTy = FVTy->getElementType();
18245 if (EltTy->isPointerTy())
18246 FVTy =
18247 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
18248
18249 // If we're going to generate more than one load, reset the sub-vector type
18250 // to something legal.
18251 FVTy = FixedVectorType::get(FVTy->getElementType(),
18252 FVTy->getNumElements() / NumLoads);
18253
18254 auto *LDVTy =
18255 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
18256
18257 IRBuilder<> Builder(LI);
18258
18259 // The base address of the load.
18260 Value *BaseAddr = LI->getPointerOperand();
18261
18262 Type *PtrTy = LI->getPointerOperandType();
18263 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
18264 LDVTy->getElementCount());
18265
18266 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18267 UseScalable, LDVTy, PtrTy);
18268
18269 // Holds sub-vectors extracted from the load intrinsic return values. The
18270 // sub-vectors are associated with the shufflevector instructions they will
18271 // replace.
18273
18274 Value *PTrue = nullptr;
18275 if (UseScalable) {
18276 std::optional<unsigned> PgPattern =
18277 getSVEPredPatternFromNumElements(FVTy->getNumElements());
18278 if (Subtarget->getMinSVEVectorSizeInBits() ==
18279 Subtarget->getMaxSVEVectorSizeInBits() &&
18280 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
18281 PgPattern = AArch64SVEPredPattern::all;
18282
18283 auto *PTruePat =
18284 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
18285 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18286 {PTruePat});
18287 }
18288
18289 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
18290
18291 // If we're generating more than one load, compute the base address of
18292 // subsequent loads as an offset from the previous.
18293 if (LoadCount > 0)
18294 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
18295 FVTy->getNumElements() * Factor);
18296
18297 CallInst *LdN;
18298 if (UseScalable)
18299 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
18300 else
18301 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18302
18303 // Extract and store the sub-vectors returned by the load intrinsic.
18304 for (unsigned i = 0; i < Shuffles.size(); i++) {
18305 ShuffleVectorInst *SVI = Shuffles[i];
18306 unsigned Index = Indices[i];
18307
18308 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
18309
18310 if (UseScalable)
18311 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
18312
18313 // Convert the integer vector to pointer vector if the element is pointer.
18314 if (EltTy->isPointerTy())
18315 SubVec = Builder.CreateIntToPtr(
18317 FVTy->getNumElements()));
18318
18319 SubVecs[SVI].push_back(SubVec);
18320 }
18321 }
18322
18323 // Replace uses of the shufflevector instructions with the sub-vectors
18324 // returned by the load intrinsic. If a shufflevector instruction is
18325 // associated with more than one sub-vector, those sub-vectors will be
18326 // concatenated into a single wide vector.
18327 for (ShuffleVectorInst *SVI : Shuffles) {
18328 auto &SubVec = SubVecs[SVI];
18329 auto *WideVec =
18330 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
18331 SVI->replaceAllUsesWith(WideVec);
18332 }
18333
18334 return true;
18335}
18336
18337template <typename Iter>
18338bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
18339 int MaxLookupDist = 20;
18340 unsigned IdxWidth = DL.getIndexSizeInBits(0);
18341 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
18342 const Value *PtrA1 =
18344
18345 while (++It != End) {
18346 if (It->isDebugOrPseudoInst())
18347 continue;
18348 if (MaxLookupDist-- == 0)
18349 break;
18350 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
18351 const Value *PtrB1 =
18352 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
18353 DL, OffsetB);
18354 if (PtrA1 == PtrB1 &&
18355 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
18356 .abs() == 16)
18357 return true;
18358 }
18359 }
18360
18361 return false;
18362}
18363
18364/// Lower an interleaved store into a stN intrinsic.
18365///
18366/// E.g. Lower an interleaved store (Factor = 3):
18367/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
18368/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
18369/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18370///
18371/// Into:
18372/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
18373/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
18374/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
18375/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18376///
18377/// Note that the new shufflevectors will be removed and we'll only generate one
18378/// st3 instruction in CodeGen.
18379///
18380/// Example for a more general valid mask (Factor 3). Lower:
18381/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
18382/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
18383/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18384///
18385/// Into:
18386/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
18387/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
18388/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
18389/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18391 Value *LaneMask,
18392 ShuffleVectorInst *SVI,
18393 unsigned Factor,
18394 const APInt &GapMask) const {
18395
18396 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18397 "Invalid interleave factor");
18398 auto *SI = dyn_cast<StoreInst>(Store);
18399 if (!SI)
18400 return false;
18401 assert(!LaneMask && GapMask.popcount() == Factor &&
18402 "Unexpected mask on store");
18403
18404 auto *VecTy = cast<FixedVectorType>(SVI->getType());
18405 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18406
18407 unsigned LaneLen = VecTy->getNumElements() / Factor;
18408 Type *EltTy = VecTy->getElementType();
18409 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
18410
18411 const DataLayout &DL = SI->getDataLayout();
18412 bool UseScalable;
18413
18414 // Skip if we do not have NEON and skip illegal vector types. We can
18415 // "legalize" wide vector types into multiple interleaved accesses as long as
18416 // the vector types are divisible by 128.
18417 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18418 return false;
18419
18420 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
18421
18422 Value *Op0 = SVI->getOperand(0);
18423 Value *Op1 = SVI->getOperand(1);
18424 IRBuilder<> Builder(SI);
18425
18426 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
18427 // vectors to integer vectors.
18428 if (EltTy->isPointerTy()) {
18429 Type *IntTy = DL.getIntPtrType(EltTy);
18430 unsigned NumOpElts =
18431 cast<FixedVectorType>(Op0->getType())->getNumElements();
18432
18433 // Convert to the corresponding integer vector.
18434 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
18435 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
18436 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
18437
18438 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
18439 }
18440
18441 // If we're going to generate more than one store, reset the lane length
18442 // and sub-vector type to something legal.
18443 LaneLen /= NumStores;
18444 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18445
18446 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
18447 : SubVecTy;
18448
18449 // The base address of the store.
18450 Value *BaseAddr = SI->getPointerOperand();
18451
18452 auto Mask = SVI->getShuffleMask();
18453
18454 // Sanity check: bail out if none of the indices are in range.
18455 // If the mask is `poison`, `Mask` may be a vector of -1s, and if all of the
18456 // indices are `poison` an out-of-bounds read would happen later.
18457 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
18458 return false;
18459 }
18460 // A 64-bit st2 which does not start at element 0 will involve adding extra
18461 // ext elements, making the st2 unprofitable, and if there is a nearby store
18462 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
18463 // zip;ldp pair, which has higher throughput.
18464 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
18465 (Mask[0] != 0 ||
18466 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
18467 DL) ||
18468 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
18469 BaseAddr, DL)))
18470 return false;
18471
18472 Type *PtrTy = SI->getPointerOperandType();
18473 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
18474 STVTy->getElementCount());
18475
18476 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18477 UseScalable, STVTy, PtrTy);
18478
18479 Value *PTrue = nullptr;
18480 if (UseScalable) {
18481 std::optional<unsigned> PgPattern =
18482 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
18483 if (Subtarget->getMinSVEVectorSizeInBits() ==
18484 Subtarget->getMaxSVEVectorSizeInBits() &&
18485 Subtarget->getMinSVEVectorSizeInBits() ==
18486 DL.getTypeSizeInBits(SubVecTy))
18487 PgPattern = AArch64SVEPredPattern::all;
18488
18489 auto *PTruePat =
18490 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18491 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18492 {PTruePat});
18493 }
18494
18495 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18496
18498
18499 // Split the shufflevector operands into sub vectors for the new stN call.
18500 for (unsigned i = 0; i < Factor; i++) {
18501 Value *Shuffle;
18502 unsigned IdxI = StoreCount * LaneLen * Factor + i;
18503 if (Mask[IdxI] >= 0) {
18504 Shuffle = Builder.CreateShuffleVector(
18505 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18506 } else {
18507 unsigned StartMask = 0;
18508 for (unsigned j = 1; j < LaneLen; j++) {
18509 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18510 if (Mask[IdxJ] >= 0) {
18511 StartMask = Mask[IdxJ] - j;
18512 break;
18513 }
18514 }
18515 // Note: Filling undef gaps with random elements is ok, since
18516 // those elements were being written anyway (with undefs).
18517 // In the case of all undefs we default to using elements from 0.
18518 // Note: StartMask cannot be negative; that is checked in
18519 // isReInterleaveMask.
18520 Shuffle = Builder.CreateShuffleVector(
18521 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18522 }
18523
18524 if (UseScalable)
18525 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18526 Shuffle, uint64_t(0));
18527
18528 Ops.push_back(Shuffle);
18529 }
18530
18531 if (UseScalable)
18532 Ops.push_back(PTrue);
18533
18534 // If we're generating more than one store, compute the base address of
18535 // subsequent stores as an offset from the previous one.
18536 if (StoreCount > 0)
18537 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18538 BaseAddr, LaneLen * Factor);
18539
18540 Ops.push_back(BaseAddr);
18541 Builder.CreateCall(StNFunc, Ops);
18542 }
18543 return true;
18544}
18545
18547 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
18548 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18549 if (Factor != 2 && Factor != 3 && Factor != 4) {
18550 LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
18551 return false;
18552 }
18553 auto *LI = dyn_cast<LoadInst>(Load);
18554 if (!LI)
18555 return false;
18556 assert(!Mask && "Unexpected mask on a load\n");
18557
18559
18560 const DataLayout &DL = LI->getModule()->getDataLayout();
18561 bool UseScalable;
18562 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18563 return false;
18564
18565 // TODO: Add support for using SVE instructions with fixed types later, using
18566 // the code from lowerInterleavedLoad to obtain the correct container type.
18567 if (UseScalable && !VTy->isScalableTy())
18568 return false;
18569
18570 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18571 VectorType *LdTy =
18573 VTy->getElementCount().divideCoefficientBy(NumLoads));
18574
18575 Type *PtrTy = LI->getPointerOperandType();
18576 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18577 UseScalable, LdTy, PtrTy);
18578
18579 IRBuilder<> Builder(LI);
18580 Value *Pred = nullptr;
18581 if (UseScalable)
18582 Pred =
18583 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18584
18585 Value *BaseAddr = LI->getPointerOperand();
18586 Value *Result = nullptr;
18587 if (NumLoads > 1) {
18588 // Create multiple legal small ldN.
18589 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18590 for (unsigned I = 0; I < NumLoads; ++I) {
18591 Value *Offset = Builder.getInt64(I * Factor);
18592
18593 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18594 Value *LdN = nullptr;
18595 if (UseScalable)
18596 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18597 else
18598 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
18599 Value *Idx =
18600 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
18601 for (unsigned J = 0; J < Factor; ++J) {
18602 ExtractedLdValues[J] = Builder.CreateInsertVector(
18603 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18604 }
18605 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18606 }
18607
18608 // Merge the values from different factors.
18609 Result = PoisonValue::get(DI->getType());
18610 for (unsigned J = 0; J < Factor; ++J)
18611 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18612 } else {
18613 if (UseScalable)
18614 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18615 else
18616 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18617 }
18618
18619 // Replace the output of the deinterleave intrinsic with the output of the ldN.
18620 DI->replaceAllUsesWith(Result);
18621 return true;
18622}
18623
18625 Instruction *Store, Value *Mask,
18626 ArrayRef<Value *> InterleavedValues) const {
18627 unsigned Factor = InterleavedValues.size();
18628 if (Factor != 2 && Factor != 3 && Factor != 4) {
18629 LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
18630 return false;
18631 }
18633 if (!SI)
18634 return false;
18635 assert(!Mask && "Unexpected mask on plain store");
18636
18637 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18638 const DataLayout &DL = SI->getModule()->getDataLayout();
18639
18640 bool UseScalable;
18641 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18642 return false;
18643
18644 // TODO: Add support for using SVE instructions with fixed types later, using
18645 // the code from lowerInterleavedStore to obtain the correct container type.
18646 if (UseScalable && !VTy->isScalableTy())
18647 return false;
18648
18649 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18650
18651 VectorType *StTy =
18652      VectorType::get(VTy->getElementType(),
18653                      VTy->getElementCount().divideCoefficientBy(NumStores));
18654
18655 Type *PtrTy = SI->getPointerOperandType();
18656 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18657 UseScalable, StTy, PtrTy);
18658
18659 IRBuilder<> Builder(SI);
18660
18661 Value *BaseAddr = SI->getPointerOperand();
18662 Value *Pred = nullptr;
18663
18664 if (UseScalable)
18665 Pred =
18666 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18667
18668 auto ExtractedValues = InterleavedValues;
18669 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18670 if (UseScalable)
18671 StoreOperands.push_back(Pred);
18672 StoreOperands.push_back(BaseAddr);
18673 for (unsigned I = 0; I < NumStores; ++I) {
18674 Value *Address = BaseAddr;
18675 if (NumStores > 1) {
18676 Value *Offset = Builder.getInt64(I * Factor);
18677 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18678 Value *Idx =
18679 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18680 for (unsigned J = 0; J < Factor; J++) {
18681 StoreOperands[J] =
18682 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18683 }
18684      // Update the address operand for this chunk.
18685 StoreOperands[StoreOperands.size() - 1] = Address;
18686 }
18687 Builder.CreateCall(StNFunc, StoreOperands);
18688 }
18689 return true;
18690}
18691
18692 EVT AArch64TargetLowering::getOptimalMemOpType(
18693     LLVMContext &Context, const MemOp &Op,
18694 const AttributeList &FuncAttributes) const {
18695 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18696 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18697 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18698  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that it
18699  // would take one instruction to materialize the v2i64 zero plus one store
18700  // (with a restrictive addressing mode), so just do i64 stores.
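  // Illustrative outcomes of the selection below, assuming the alignment
  // checks pass and NEON/FP are usable:
  //   memset of 64 bytes, 16-byte aligned  -> MVT::v16i8
  //   memset of 16 bytes, 8-byte aligned   -> MVT::i64  (small memset)
  //   memcpy of 16 bytes, 16-byte aligned  -> MVT::f128
  //   copy of 4 bytes, 4-byte aligned      -> MVT::i32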
18701 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18702 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18703 if (Op.isAligned(AlignCheck))
18704 return true;
18705 unsigned Fast;
18706    return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18707                                          MachineMemOperand::MONone, &Fast) &&
18708           Fast;
18709 };
18710
18711 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18712 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18713 return MVT::v16i8;
18714 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18715 return MVT::f128;
18716 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18717 return MVT::i64;
18718 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18719 return MVT::i32;
18720 return MVT::Other;
18721}
18722
18723 LLT AArch64TargetLowering::getOptimalMemOpLLT(
18724     const MemOp &Op, const AttributeList &FuncAttributes) const {
18725 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18726 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18727 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18728  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that it
18729  // would take one instruction to materialize the v2i64 zero plus one store
18730  // (with a restrictive addressing mode), so just do i64 stores.
18731 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18732 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18733 if (Op.isAligned(AlignCheck))
18734 return true;
18735 unsigned Fast;
18736    return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18737                                          MachineMemOperand::MONone, &Fast) &&
18738           Fast;
18739 };
18740
18741 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18742 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18743 return LLT::fixed_vector(2, 64);
18744 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18745 return LLT::scalar(128);
18746 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18747 return LLT::scalar(64);
18748 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18749 return LLT::scalar(32);
18750 return LLT();
18751}
18752
18753// 12-bit optionally shifted immediates are legal for adds.
18754 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
18755   if (Immed == std::numeric_limits<int64_t>::min()) {
18756 return false;
18757 }
18758 // Same encoding for add/sub, just flip the sign.
18759 return isLegalArithImmed((uint64_t)std::abs(Immed));
18760}
18761
18762 bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
18763   // We will only emit addvl/inc* instructions for SVE2
18764 if (!Subtarget->hasSVE2())
18765 return false;
18766
18767   // addvl's immediate is in units of the number of bytes in a register.
18768   // Since the base supported size is 128 bits (16 bytes), we divide the
18769   // immediate by 16 to get a useful immediate to multiply by vscale;
18770   // the division must leave no remainder.
18771 if (Imm % 16 == 0)
18772 return isInt<6>(Imm / 16);
18773
18774 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18775 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18776 // of addvl as a result, so only take h|w|d into account.
18777 // Dec[h|w|d] will cover subtractions.
18778 // Immediates are in the range [1,16], so we can't do a 2's complement check.
18779 // FIXME: Can we make use of other patterns to cover other immediates?
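  // Worked examples of the checks below (assuming the 'all' pattern):
  //   Imm = 64   -> 64 / 16 = 4 fits in a signed 6-bit field      -> addvl #4
  //   Imm = 40   -> not a multiple of 16, but 40 / 8 = 5 <= 16    -> inch with multiplier 5
  //   Imm = 1024 -> 1024 / 16 = 64 does not fit in 6 signed bits  -> not legal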
18780
18781 // inch|dech
18782 if (Imm % 8 == 0)
18783 return std::abs(Imm / 8) <= 16;
18784 // incw|decw
18785 if (Imm % 4 == 0)
18786 return std::abs(Imm / 4) <= 16;
18787 // incd|decd
18788 if (Imm % 2 == 0)
18789 return std::abs(Imm / 2) <= 16;
18790
18791 return false;
18792}
18793
18794// Return false to prevent folding
18795// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18796// if the folding leads to worse code.
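// For example (illustrative): with c1 = 1 (a legal ADD immediate) and
// c2 = 0x111111, the folded constant c1*c2 = 0x111111 is not an ADD immediate
// and needs a MOVZ+MOVK pair to materialize, so we return false and keep the
// (mul (add x, 1), 0x111111) form.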
18797 bool AArch64TargetLowering::isMulAddWithConstProfitable(
18798     SDValue AddNode, SDValue ConstNode) const {
18799 // Let the DAGCombiner decide for vector types and large types.
18800 const EVT VT = AddNode.getValueType();
18801 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18802 return true;
18803
18804 // It is worse if c1 is legal add immediate, while c1*c2 is not
18805 // and has to be composed by at least two instructions.
18806 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18807 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18808 const int64_t C1 = C1Node->getSExtValue();
18809 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
18810   if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
18811     return true;
18812   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
18813   // Adapt to the width of a register.
18814 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18815 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
18816 if (Insn.size() > 1)
18817 return false;
18818
18819 // Default to true and let the DAGCombiner decide.
18820 return true;
18821}
18822
18823// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
18824// immediates is the same as for an add or a sub.
18825 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
18826   return isLegalAddImmediate(Immed);
18827}
18828
18829/// isLegalAddressingMode - Return true if the addressing mode represented
18830/// by AM is legal for this target, for a load/store of the specified type.
18831 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
18832                                                   const AddrMode &AMode, Type *Ty,
18833 unsigned AS, Instruction *I) const {
18834 // AArch64 has five basic addressing modes:
18835 // reg
18836 // reg + 9-bit signed offset
18837 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
18838 // reg1 + reg2
18839 // reg + SIZE_IN_BYTES * reg
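  // For an i64 load, for example, these correspond to [x0], [x0, #-256]
  // through [x0, #255] (9-bit signed), [x0, #imm] with imm = 8 * uimm12
  // (byte offsets 0..32760 in steps of 8), [x0, x1], and [x0, x1, lsl #3].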
18840
18841 // No global is ever allowed as a base.
18842 if (AMode.BaseGV)
18843 return false;
18844
18845 // No reg+reg+imm addressing.
18846 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
18847 return false;
18848
18849 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
18850 // `2*ScaledReg` into `BaseReg + ScaledReg`
18851 AddrMode AM = AMode;
18852 if (AM.Scale && !AM.HasBaseReg) {
18853 if (AM.Scale == 1) {
18854 AM.HasBaseReg = true;
18855 AM.Scale = 0;
18856 } else if (AM.Scale == 2) {
18857 AM.HasBaseReg = true;
18858 AM.Scale = 1;
18859 } else {
18860 return false;
18861 }
18862 }
18863
18864 // A base register is required in all addressing modes.
18865 if (!AM.HasBaseReg)
18866 return false;
18867
18868 if (Ty->isScalableTy()) {
18869 if (isa<ScalableVectorType>(Ty)) {
18870 // See if we have a foldable vscale-based offset, for vector types which
18871 // are either legal or smaller than the minimum; more work will be
18872 // required if we need to consider addressing for types which need
18873 // legalization by splitting.
18874 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
18875 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
18876 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
18877 isPowerOf2_64(VecNumBytes))
18878 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
18879
18880 uint64_t VecElemNumBytes =
18881 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
18882 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
18883 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
18884 }
18885
18886 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
18887 }
18888
18889 // No scalable offsets allowed for non-scalable types.
18890 if (AM.ScalableOffset)
18891 return false;
18892
18893 // check reg + imm case:
18894 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
18895 uint64_t NumBytes = 0;
18896 if (Ty->isSized()) {
18897 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18898 NumBytes = NumBits / 8;
18899 if (!isPowerOf2_64(NumBits))
18900 NumBytes = 0;
18901 }
18902
18903 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18904 AM.Scale);
18905}
18906
18907 // Check whether the two offsets lie in the same imm24 range (their high 12 bits
18908 // match); if so, their common high part can be folded into the offset of an add.
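// For example, MinOffset = 0x3450 and MaxOffset = 0x3ff8 share the high part
// 0x3000, which is encodable as an ADD with a shifted 12-bit immediate
// (#3, lsl #12), so 0x3000 is returned and the remaining low parts (0x450 and
// 0xff8) can be folded into the memory-operand immediates.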
18909 int64_t
18910 AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
18911                                                       int64_t MaxOffset) const {
18912 int64_t HighPart = MinOffset & ~0xfffULL;
18913 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18914 // Rebase the value to an integer multiple of imm12.
18915 return HighPart;
18916 }
18917
18918 return 0;
18919}
18920
18921 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
18922   // Consider splitting large offsets of structs or arrays.
18923 return true;
18924}
18925
18926 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
18927     const MachineFunction &MF, EVT VT) const {
18928 EVT ScalarVT = VT.getScalarType();
18929
18930 if (!ScalarVT.isSimple())
18931 return false;
18932
18933 switch (ScalarVT.getSimpleVT().SimpleTy) {
18934 case MVT::f16:
18935 return Subtarget->hasFullFP16();
18936 case MVT::f32:
18937 case MVT::f64:
18938 return true;
18939 case MVT::bf16:
18940 return VT.isScalableVector() && Subtarget->hasBF16() &&
18941 Subtarget->isNonStreamingSVEorSME2Available();
18942 default:
18943 break;
18944 }
18945
18946 return false;
18947}
18948
18949 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
18950                                                        Type *Ty) const {
18951 switch (Ty->getScalarType()->getTypeID()) {
18952 case Type::FloatTyID:
18953 case Type::DoubleTyID:
18954 return true;
18955 default:
18956 return false;
18957 }
18958}
18959
18961 EVT VT, CodeGenOptLevel OptLevel) const {
18962 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
18964}
18965
18966const MCPhysReg *
18968 // LR is a callee-save register, but we must treat it as clobbered by any call
18969 // site. Hence we include LR in the scratch registers, which are in turn added
18970 // as implicit-defs for stackmaps and patchpoints.
18971 static const MCPhysReg ScratchRegs[] = {
18972 AArch64::X16, AArch64::X17, AArch64::LR, 0
18973 };
18974 return ScratchRegs;
18975}
18976
18978 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
18979 return RCRegs;
18980}
18981
18982bool
18984 CombineLevel Level) const {
18985 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
18986 N->getOpcode() == ISD::SRL) &&
18987 "Expected shift op");
18988
18989 SDValue ShiftLHS = N->getOperand(0);
18990 EVT VT = N->getValueType(0);
18991
18992 if (!ShiftLHS->hasOneUse())
18993 return false;
18994
18995 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
18996 !ShiftLHS.getOperand(0)->hasOneUse())
18997 return false;
18998
18999 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
19000 // combine it with shift 'N' to let it be lowered to UBFX except:
19001 // ((x >> C) & mask) << C.
19002 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
19003 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
19004 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
19005 if (isMask_64(TruncMask)) {
19006 SDValue AndLHS = ShiftLHS.getOperand(0);
19007 if (AndLHS.getOpcode() == ISD::SRL) {
19008 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
19009 if (N->getOpcode() == ISD::SHL)
19010 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
19011 return SRLC->getZExtValue() == SHLC->getZExtValue();
19012 return false;
19013 }
19014 }
19015 }
19016 }
19017 return true;
19018}
19019
19021 const SDNode *N) const {
19022 assert(N->getOpcode() == ISD::XOR &&
19023 (N->getOperand(0).getOpcode() == ISD::SHL ||
19024 N->getOperand(0).getOpcode() == ISD::SRL) &&
19025 "Expected XOR(SHIFT) pattern");
19026
19027 // Only commute if the entire NOT mask is a hidden shifted mask.
19028 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
19029 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19030 if (XorC && ShiftC) {
19031 unsigned MaskIdx, MaskLen;
19032 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
19033 unsigned ShiftAmt = ShiftC->getZExtValue();
19034 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
19035 if (N->getOperand(0).getOpcode() == ISD::SHL)
19036 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
19037 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
19038 }
19039 }
19040
19041 return false;
19042}
19043
19045 const SDNode *N) const {
19046 assert(((N->getOpcode() == ISD::SHL &&
19047 N->getOperand(0).getOpcode() == ISD::SRL) ||
19048 (N->getOpcode() == ISD::SRL &&
19049 N->getOperand(0).getOpcode() == ISD::SHL)) &&
19050 "Expected shift-shift mask");
19051 // Don't allow multiuse shift folding with the same shift amount.
19052 if (!N->getOperand(0)->hasOneUse())
19053 return false;
19054
19055 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
19056 EVT VT = N->getValueType(0);
19057 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
19058 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19059 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
19060 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
19061 }
19062
19063   // We do not need to fold when this shift is used in the specific load case:
19064 // (ldr x, (add x, (shl (srl x, c1) 2)))
19065 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
19066 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
19067 unsigned ShlAmt = C2->getZExtValue();
19068 if (auto ShouldADD = *N->user_begin();
19069 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
19070 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
19071 EVT MemVT = Load->getMemoryVT();
19072
19073 if (Load->getValueType(0).isScalableVector())
19074 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
19075
19076 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
19077 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
19078 }
19079 }
19080 }
19081 }
19082
19083 return true;
19084}
19085
19087 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
19088 SDValue Y) const {
19089 return VT.isScalableVector() && isTypeLegal(VT) &&
19090 SelectOpcode == ISD::VSELECT;
19091}
19092
19094 Type *Ty) const {
19095 assert(Ty->isIntegerTy());
19096
19097 unsigned BitSize = Ty->getPrimitiveSizeInBits();
19098 if (BitSize == 0)
19099 return false;
19100
19101 int64_t Val = Imm.getSExtValue();
19102 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
19103 return true;
19104
19105 if (Val < 0)
19106 Val = ~Val;
19107 if (BitSize == 32)
19108 Val &= (1LL << 32) - 1;
19109
19110 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
19111 // MOVZ is free so return true for one or fewer MOVK.
19112 return Shift < 3;
19113}
19114
19116 unsigned Index) const {
19118 return false;
19119
19120 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
19121}
19122
19124 LLVMContext &Context, EVT VT) const {
19125 if (getTypeAction(Context, VT) != TypeExpandInteger)
19126 return false;
19127
19128 EVT LegalTy = EVT::getIntegerVT(Context, VT.getSizeInBits() / 2);
19129 return getTypeAction(Context, LegalTy) == TargetLowering::TypeLegal;
19130}
19131
19132/// Turn vector tests of the signbit in the form of:
19133/// xor (sra X, elt_size(X)-1), -1
19134/// into:
19135/// cmge X, X, #0
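/// For example, with X : v4i32 the input is xor (VASHR X, #31), splat(-1):
/// the shift smears each lane's sign bit across the lane (all-ones for
/// negative lanes, zero otherwise), and the NOT of that is precisely the mask
/// produced by setcc(X, 0, setge), i.e. CMGE X, X, #0.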
19136 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
19137                                          const AArch64Subtarget *Subtarget) {
19138 EVT VT = N->getValueType(0);
19139 if (!Subtarget->hasNEON() || !VT.isVector())
19140 return SDValue();
19141
19142 // There must be a shift right algebraic before the xor, and the xor must be a
19143 // 'not' operation.
19144 SDValue Shift = N->getOperand(0);
19145 SDValue Ones = N->getOperand(1);
19146 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
19147       !ISD::isConstantSplatVectorAllOnes(Ones.getNode()))
19148     return SDValue();
19149
19150 // The shift should be smearing the sign bit across each vector element.
19151 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
19152 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
19153 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
19154 return SDValue();
19155
19156 SDLoc DL(N);
19157 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
19158 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
19159}
19160
19161// Given a vecreduce_add node, detect the below pattern and convert it to the
19162 // node sequence with UABDL, [S|U]ABD and UADDLP.
19163//
19164// i32 vecreduce_add(
19165// v16i32 abs(
19166// v16i32 sub(
19167// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
19168//
19169// or
19170//
19171// i32 vecreduce_add(
19172// v16i32 zext(
19173// v16i16 abs(
19174// v16i16 sub(
19175// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
19176//
19177// =================>
19178// i32 vecreduce_add(
19179// v4i32 UADDLP(
19180// v8i16 add(
19181// v8i16 zext(
19182// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
19183// v8i16 zext(
19184// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
19185 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
19186                                                     SelectionDAG &DAG) {
19187 // Assumed i32 vecreduce_add
19188 if (N->getValueType(0) != MVT::i32)
19189 return SDValue();
19190
19191 SDValue VecReduceOp0 = N->getOperand(0);
19192 bool SawTrailingZext = false;
19193 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
19194 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
19195 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
19196 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
19197 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
19198 SawTrailingZext = true;
19199 VecReduceOp0 = VecReduceOp0.getOperand(0);
19200 }
19201
19202   // The abs input is v16i16 if we looked through that zext, v16i32 otherwise.
19203 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
19204 // Assumed v16i16 or v16i32 abs input
19205 unsigned Opcode = VecReduceOp0.getOpcode();
19206 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
19207 return SDValue();
19208
19209 SDValue ABS = VecReduceOp0;
19210 // Assumed v16i16 or v16i32 sub
19211 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
19212 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
19213 return SDValue();
19214
19215 SDValue SUB = ABS->getOperand(0);
19216 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
19217 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
19218 // Assumed v16i16 or v16i32 type
19219 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
19220 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
19221 return SDValue();
19222
19223 // Assumed zext or sext
19224 bool IsZExt = false;
19225 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
19226 IsZExt = true;
19227 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
19228 IsZExt = false;
19229 } else
19230 return SDValue();
19231
19232 SDValue EXT0 = SUB->getOperand(0);
19233 SDValue EXT1 = SUB->getOperand(1);
19234 // Assumed zext's operand has v16i8 type
19235 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
19236 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
19237 return SDValue();
19238
19239 // Pattern is detected. Let's convert it to sequence of nodes.
19240 SDLoc DL(N);
19241
19242 // First, create the node pattern of UABD/SABD.
19243 SDValue UABDHigh8Op0 =
19244 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19245 DAG.getConstant(8, DL, MVT::i64));
19246 SDValue UABDHigh8Op1 =
19247 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19248 DAG.getConstant(8, DL, MVT::i64));
19249 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19250 UABDHigh8Op0, UABDHigh8Op1);
19251 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
19252
19253 // Second, create the node pattern of UABAL.
19254 SDValue UABDLo8Op0 =
19255 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19256 DAG.getConstant(0, DL, MVT::i64));
19257 SDValue UABDLo8Op1 =
19258 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19259 DAG.getConstant(0, DL, MVT::i64));
19260 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19261 UABDLo8Op0, UABDLo8Op1);
19262 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
19263 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
19264
19265 // Third, create the node of UADDLP.
19266 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
19267
19268 // Fourth, create the node of VECREDUCE_ADD.
19269 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
19270}
19271
19272static SDValue
19274 const AArch64Subtarget *ST) {
19275 if (DCI.isBeforeLegalize())
19276 return SDValue();
19277
19278 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
19279 /*IsEqual=*/false))
19280 return While;
19281
19282 if (!N->getValueType(0).isScalableVector() ||
19283 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
19284 return SDValue();
19285
19286 // Count the number of users which are extract_vectors.
19287 unsigned NumExts = count_if(N->users(), [](SDNode *Use) {
19288 return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR;
19289 });
19290
19291 auto MaskEC = N->getValueType(0).getVectorElementCount();
19292 if (!MaskEC.isKnownMultipleOf(NumExts))
19293 return SDValue();
19294
19295 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts);
19296 if (ExtMinEC.getKnownMinValue() < 2)
19297 return SDValue();
19298
19299 SmallVector<SDNode *> Extracts(NumExts, nullptr);
19300 for (SDNode *Use : N->users()) {
19301 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
19302 continue;
19303
19304 // Ensure the extract type is correct (e.g. if NumExts is 4 and
19305     // the mask return type is nxv8i1, each extract should be nxv2i1).
19306 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
19307 return SDValue();
19308
19309 // There should be exactly one extract for each part of the mask.
19310 unsigned Offset = Use->getConstantOperandVal(1);
19311 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
19312 if (Extracts[Part] != nullptr)
19313 return SDValue();
19314
19315 Extracts[Part] = Use;
19316 }
19317
19318 SelectionDAG &DAG = DCI.DAG;
19319 SDLoc DL(N);
19320 SDValue ID =
19321 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
19322
19323 SDValue Idx = N->getOperand(0);
19324 SDValue TC = N->getOperand(1);
19325 if (Idx.getValueType() != MVT::i64) {
19326 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
19327 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
19328 }
19329
19330 // Create the whilelo_x2 intrinsics from each pair of extracts
19331 EVT ExtVT = Extracts[0]->getValueType(0);
19332 EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext());
19333 auto R =
19334 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19335 DCI.CombineTo(Extracts[0], R.getValue(0));
19336 DCI.CombineTo(Extracts[1], R.getValue(1));
19337 SmallVector<SDValue> Concats = {DAG.getNode(
19338 ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))};
19339
19340 if (NumExts == 2) {
19341 assert(N->getValueType(0) == DoubleExtVT);
19342 return Concats[0];
19343 }
19344
19345 auto Elts =
19346 DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2);
19347 for (unsigned I = 2; I < NumExts; I += 2) {
19348 // After the first whilelo_x2, we need to increment the starting value.
19349 Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts);
19350 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19351 DCI.CombineTo(Extracts[I], R.getValue(0));
19352 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
19353 Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT,
19354 R.getValue(0), R.getValue(1)));
19355 }
19356
19357 return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats);
19358}
19359
19360// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
19361// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
19362// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
19363 // If we have vectors larger than v16i8, we extract v16i8 subvectors, follow the
19364 // same steps as above to get a DOT instruction for each of them, concatenate
19365 // the results, and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
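// For example (illustrative), with A, B : v16i8 and zero-extends to v16i32:
//   vecreduce.add(zext(A))               -> vecreduce.add(UDOT(zeroes, A, ones))
//   vecreduce.add(mul(zext(A), zext(B))) -> vecreduce.add(UDOT(zeroes, A, B))
// where "zeroes" is a v4i32 zero accumulator and "ones" is a v16i8 splat of 1,
// so only a cheap v4i32 reduction remains.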
19366 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
19367                                           const AArch64Subtarget *ST) {
19368 if (!ST->isNeonAvailable())
19369 return SDValue();
19370
19371 if (!ST->hasDotProd())
19372     return performVecReduceAddCombineWithUADDLP(N, DAG);
19373
19374 SDValue Op0 = N->getOperand(0);
19375 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
19376 Op0.getValueType().getVectorElementType() != MVT::i32)
19377 return SDValue();
19378
19379 unsigned ExtOpcode = Op0.getOpcode();
19380 SDValue A = Op0;
19381 SDValue B;
19382 unsigned DotOpcode;
19383 if (ExtOpcode == ISD::MUL) {
19384 A = Op0.getOperand(0);
19385 B = Op0.getOperand(1);
19386 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
19387 return SDValue();
19388 auto OpCodeA = A.getOpcode();
19389 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
19390 return SDValue();
19391
19392 auto OpCodeB = B.getOpcode();
19393 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
19394 return SDValue();
19395
19396 if (OpCodeA == OpCodeB) {
19397 DotOpcode =
19398 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
19399 } else {
19400       // Check USDOT support.
19401 if (!ST->hasMatMulInt8())
19402 return SDValue();
19403 DotOpcode = AArch64ISD::USDOT;
19404 if (OpCodeA == ISD::SIGN_EXTEND)
19405 std::swap(A, B);
19406 }
19407 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
19408 DotOpcode = AArch64ISD::UDOT;
19409 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
19410 DotOpcode = AArch64ISD::SDOT;
19411 } else {
19412 return SDValue();
19413 }
19414
19415 EVT Op0VT = A.getOperand(0).getValueType();
19416 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
19417 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
19418 if (!IsValidElementCount || !IsValidSize)
19419 return SDValue();
19420
19421 SDLoc DL(Op0);
19422 // For non-mla reductions B can be set to 1. For MLA we take the operand of
19423 // the extend B.
19424 if (!B)
19425 B = DAG.getConstant(1, DL, Op0VT);
19426 else
19427 B = B.getOperand(0);
19428
19429 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
19430 unsigned NumOfVecReduce;
19431 EVT TargetType;
19432 if (IsMultipleOf16) {
19433 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
19434 TargetType = MVT::v4i32;
19435 } else {
19436 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
19437 TargetType = MVT::v2i32;
19438 }
19439 // Handle the case where we need to generate only one Dot operation.
19440 if (NumOfVecReduce == 1) {
19441 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
19442 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
19443 A.getOperand(0), B);
19444 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19445 }
19446 // Generate Dot instructions that are multiple of 16.
19447 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
19448 SmallVector<SDValue, 4> SDotVec16;
19449 unsigned I = 0;
19450 for (; I < VecReduce16Num; I += 1) {
19451 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
19452 SDValue Op0 =
19453 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
19454 DAG.getConstant(I * 16, DL, MVT::i64));
19455 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
19456 DAG.getConstant(I * 16, DL, MVT::i64));
19457 SDValue Dot =
19458 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
19459 SDotVec16.push_back(Dot);
19460 }
19461 // Concatenate dot operations.
19462 EVT SDot16EVT =
19463 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
19464 SDValue ConcatSDot16 =
19465 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
19466 SDValue VecReduceAdd16 =
19467 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
19468 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
19469 if (VecReduce8Num == 0)
19470 return VecReduceAdd16;
19471
19472 // Generate the remainder Dot operation that is multiple of 8.
19473 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
19474 SDValue Vec8Op0 =
19475 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
19476 DAG.getConstant(I * 16, DL, MVT::i64));
19477 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
19478 DAG.getConstant(I * 16, DL, MVT::i64));
19479 SDValue Dot =
19480 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
19481 SDValue VecReduceAdd8 =
19482 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19483 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
19484 VecReduceAdd8);
19485}
19486
19487// Given an (integer) vecreduce, we know the order of the inputs does not
19488// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
19489// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
19490// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
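// For example, with x : v16i8, add(zext(extract_lo(x)), zext(extract_hi(x)))
// sums lane i of the low half with lane i of the high half, while UADDLP(x)
// sums adjacent lanes (2i, 2i+1); both cover the same 16 bytes in disjoint
// pairs, so the surrounding reduction sees the same total.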
19491 static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
19492   auto DetectAddExtract = [&](SDValue A) {
19493 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
19494 // UADDLP(x) if found.
19495 assert(A.getOpcode() == ISD::ADD);
19496 EVT VT = A.getValueType();
19497 SDValue Op0 = A.getOperand(0);
19498 SDValue Op1 = A.getOperand(1);
19499 if (Op0.getOpcode() != Op1.getOpcode() ||
19500 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
19501 Op0.getOpcode() != ISD::SIGN_EXTEND))
19502 return SDValue();
19503 SDValue Ext0 = Op0.getOperand(0);
19504 SDValue Ext1 = Op1.getOperand(0);
19505     if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19506         Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19507 Ext0.getOperand(0) != Ext1.getOperand(0) ||
19509 return SDValue();
19510 // Check that the type is twice the add types, and the extract are from
19511 // upper/lower parts of the same source.
19512     if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
19513         VT.getVectorNumElements() * 2)
19514 return SDValue();
19515 if ((Ext0.getConstantOperandVal(1) != 0 ||
19516          Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
19517         (Ext1.getConstantOperandVal(1) != 0 ||
19518          Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
19519       return SDValue();
19520 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
19521 : AArch64ISD::SADDLP;
19522 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
19523 };
19524
19525 if (SDValue R = DetectAddExtract(A))
19526 return R;
19527
19528 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
19529 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
19530 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19531 A.getOperand(1));
19532 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
19533 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
19534 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19535 A.getOperand(0));
19536 return SDValue();
19537}
19538
19539// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19540// UADDLV(concat), where the concat represents the 64-bit zext sources.
19542 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19543 // UADDLV(concat(zext, zext)) if found.
19544 assert(A.getOpcode() == ISD::ADD);
19545 EVT VT = A.getValueType();
19546 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19547 return SDValue();
19548 SDValue Op0 = A.getOperand(0);
19549 SDValue Op1 = A.getOperand(1);
19550 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19551 return SDValue();
19552 SDValue Ext0 = Op0.getOperand(0);
19553 SDValue Ext1 = Op1.getOperand(0);
19554 EVT ExtVT0 = Ext0.getValueType();
19555 EVT ExtVT1 = Ext1.getValueType();
19556 // Check zext VTs are the same and 64-bit length.
19557 if (ExtVT0 != ExtVT1 ||
19558 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19559 return SDValue();
19560 // Get VT for concat of zext sources.
19561 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19562 SDValue Concat =
19563 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19564
19565 switch (VT.getSimpleVT().SimpleTy) {
19566 case MVT::v2i64:
19567 case MVT::v4i32:
19568 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19569 case MVT::v8i16: {
19570 SDValue Uaddlv =
19571 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19572 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19573 }
19574 default:
19575 llvm_unreachable("Unhandled vector type");
19576 }
19577}
19578
19580 SDValue A = N->getOperand(0);
19581 if (A.getOpcode() == ISD::ADD) {
19582 if (SDValue R = performUADDVAddCombine(A, DAG))
19583 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19584 else if (SDValue R = performUADDVZextCombine(A, DAG))
19585 return R;
19586 }
19587
19588 // uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane.
19589 MVT OpVT = A.getSimpleValueType();
19590 assert(N->getSimpleValueType(0) == OpVT &&
19591 "The operand type should be consistent with the result type of UADDV");
19592   APInt Mask = APInt::getAllOnes(OpVT.getVectorNumElements());
19593   Mask.clearBit(0);
19594 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
19595 if (KnownLeadingLanes.isZero())
19596 return A;
19597
19598 return SDValue();
19599}
19600
19603 const AArch64Subtarget *Subtarget) {
19604 if (DCI.isBeforeLegalizeOps())
19605 return SDValue();
19606
19607 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
19608}
19609
19610SDValue
19611AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
19612 SelectionDAG &DAG,
19613 SmallVectorImpl<SDNode *> &Created) const {
19614 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19615 if (isIntDivCheap(N->getValueType(0), Attr))
19616 return SDValue(N, 0); // Lower SDIV as SDIV
19617
19618 EVT VT = N->getValueType(0);
19619
19620 // If SVE is available, we can generate
19621 // sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
19622 // sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
19623 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
19624 return SDValue(N, 0);
19625
19626 // fold (sdiv X, pow2)
19627 if ((VT != MVT::i32 && VT != MVT::i64) ||
19628 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19629 return SDValue();
19630
19631 // If the divisor is 2 or -2, the default expansion is better. It will add
19632 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
19633 if (Divisor == 2 ||
19634 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
19635 return SDValue();
19636
19637 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
19638}
19639
19640SDValue
19641AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
19642 SelectionDAG &DAG,
19643 SmallVectorImpl<SDNode *> &Created) const {
19644 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19645 if (isIntDivCheap(N->getValueType(0), Attr))
19646 return SDValue(N, 0); // Lower SREM as SREM
19647
19648 EVT VT = N->getValueType(0);
19649
19650   // For scalable and fixed types, mark them as cheap so we can handle them much
19651 // later. This allows us to handle larger than legal types.
19652 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
19653 return SDValue(N, 0);
19654
19655 // fold (srem X, pow2)
19656 if ((VT != MVT::i32 && VT != MVT::i64) ||
19657 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19658 return SDValue();
19659
19660 unsigned Lg2 = Divisor.countr_zero();
19661 if (Lg2 == 0)
19662 return SDValue();
19663
19664 SDLoc DL(N);
19665 SDValue N0 = N->getOperand(0);
19666 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
19667 SDValue Zero = DAG.getConstant(0, DL, VT);
19668 SDValue CCVal, CSNeg;
19669 if (Lg2 == 1) {
19670 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
19671 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19672 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
19673
19674 Created.push_back(Cmp.getNode());
19675 Created.push_back(And.getNode());
19676 } else {
19677 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
19678 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
19679
19680 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
19681 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19682 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
19683 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
19684 Negs.getValue(1));
19685
19686 Created.push_back(Negs.getNode());
19687 Created.push_back(AndPos.getNode());
19688 Created.push_back(AndNeg.getNode());
19689 }
19690
19691 return CSNeg;
19692}
19693
19695 switch(getIntrinsicID(S.getNode())) {
19696 default:
19697 break;
19698 case Intrinsic::aarch64_sve_cntb:
19699 case Intrinsic::aarch64_sve_cnth:
19700 case Intrinsic::aarch64_sve_cntw:
19701 case Intrinsic::aarch64_sve_cntd:
19702 return true;
19703 }
19704 return false;
19705}
19706
19707// Returns the maximum (scalable) value that can be returned by an SVE count
19708// intrinsic. Returns std::nullopt if \p Op is not aarch64_sve_cnt*.
19709static std::optional<ElementCount> getMaxValueForSVECntIntrinsic(SDValue Op) {
19710 Intrinsic::ID IID = getIntrinsicID(Op.getNode());
19711 if (IID == Intrinsic::aarch64_sve_cntp)
19712 return Op.getOperand(1).getValueType().getVectorElementCount();
19713 switch (IID) {
19714 case Intrinsic::aarch64_sve_cntd:
19715 return ElementCount::getScalable(2);
19716 case Intrinsic::aarch64_sve_cntw:
19717 return ElementCount::getScalable(4);
19718 case Intrinsic::aarch64_sve_cnth:
19719 return ElementCount::getScalable(8);
19720 case Intrinsic::aarch64_sve_cntb:
19721 return ElementCount::getScalable(16);
19722 default:
19723 return std::nullopt;
19724 }
19725}
19726
19727/// Calculates what the pre-extend type is, based on the extension
19728/// operation node provided by \p Extend.
19729///
19730/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
19731/// pre-extend type is pulled directly from the operand, while other extend
19732/// operations need a bit more inspection to get this information.
19733///
19734/// \param Extend The SDNode from the DAG that represents the extend operation
19735///
19736/// \returns The type representing the \p Extend source type, or \p MVT::Other
19737/// if no valid type can be determined
19739 switch (Extend.getOpcode()) {
19740 case ISD::SIGN_EXTEND:
19741 case ISD::ZERO_EXTEND:
19742 case ISD::ANY_EXTEND:
19743 return Extend.getOperand(0).getValueType();
19744 case ISD::AssertSext:
19745 case ISD::AssertZext:
19746   case ISD::SIGN_EXTEND_INREG: {
19747     VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
19748 if (!TypeNode)
19749 return MVT::Other;
19750 return TypeNode->getVT();
19751 }
19752 case ISD::AND: {
19753     ConstantSDNode *Constant =
19754         dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
19755     if (!Constant)
19756 return MVT::Other;
19757
19758 uint32_t Mask = Constant->getZExtValue();
19759
19760 if (Mask == UCHAR_MAX)
19761 return MVT::i8;
19762 else if (Mask == USHRT_MAX)
19763 return MVT::i16;
19764 else if (Mask == UINT_MAX)
19765 return MVT::i32;
19766
19767 return MVT::Other;
19768 }
19769 default:
19770 return MVT::Other;
19771 }
19772}
19773
19774/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
19775/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
19776/// SExt/ZExt rather than the scalar SExt/ZExt
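/// For example (illustrative IR), a v4i32 node
///   buildvector(sext i16 %a to i32, sext i16 %b to i32, ...)
/// becomes sext(buildvector(%a, %b, ...) : v4i16) to v4i32, so a single vector
/// extend (e.g. sshll) replaces the per-element scalar extends.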
19777 static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
19778   EVT VT = BV.getValueType();
19779 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
19781 return SDValue();
19782
19783 // Use the first item in the buildvector/shuffle to get the size of the
19784 // extend, and make sure it looks valid.
19785 SDValue Extend = BV->getOperand(0);
19786 unsigned ExtendOpcode = Extend.getOpcode();
19787 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
19788 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
19789 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
19790 ExtendOpcode == ISD::AssertSext;
19791 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
19792 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
19793 return SDValue();
19794 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
19795 // ensure calculatePreExtendType will work without issue.
19796 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
19797 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
19798 return SDValue();
19799
19800 // Restrict valid pre-extend data type
19801 EVT PreExtendType = calculatePreExtendType(Extend);
19802 if (PreExtendType == MVT::Other ||
19803 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
19804 return SDValue();
19805
19806 // Make sure all other operands are equally extended.
19807 bool SeenZExtOrSExt = !IsAnyExt;
19808 for (SDValue Op : drop_begin(BV->ops())) {
19809 if (Op.isUndef())
19810 continue;
19811
19812 if (calculatePreExtendType(Op) != PreExtendType)
19813 return SDValue();
19814
19815 unsigned Opc = Op.getOpcode();
19816 if (Opc == ISD::ANY_EXTEND)
19817 continue;
19818
19819 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
19820                        Opc == ISD::AssertSext;
19821
19822 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
19823 return SDValue();
19824
19825 IsSExt = OpcIsSExt;
19826 SeenZExtOrSExt = true;
19827 }
19828
19829 SDValue NBV;
19830 SDLoc DL(BV);
19831 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
19832 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
19833 EVT PreExtendLegalType =
19834 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
19835     SmallVector<SDValue, 8> NewOps;
19836     for (SDValue Op : BV->ops())
19837 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
19838 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
19839 PreExtendLegalType));
19840 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
19841 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
19842 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
19843 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
19844 BV.getOperand(1).isUndef()
19845 ? DAG.getUNDEF(PreExtendVT)
19846 : BV.getOperand(1).getOperand(0),
19847 cast<ShuffleVectorSDNode>(BV)->getMask());
19848 }
19849 unsigned ExtOpc = !SeenZExtOrSExt
19850                        ? ISD::ANY_EXTEND
19851                        : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
19852 return DAG.getNode(ExtOpc, DL, VT, NBV);
19853}
19854
19855/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
19856/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
19857 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
19858   // If the value type isn't a vector, none of the operands are going to be dups
19859 EVT VT = Mul->getValueType(0);
19860 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19861 return SDValue();
19862
19863 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
19864 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
19865
19866 // Neither operands have been changed, don't make any further changes
19867 if (!Op0 && !Op1)
19868 return SDValue();
19869
19870 SDLoc DL(Mul);
19871 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
19872 Op1 ? Op1 : Mul->getOperand(1));
19873}
19874
19875// Multiplying an RDSVL value by a constant can sometimes be done cheaper by
19876// folding a power-of-two factor of the constant into the RDSVL immediate and
19877// compensating with an extra shift.
19878//
19879// We rewrite:
19880// (mul (srl (rdsvl 1), w), x)
19881// to one of:
19882// (shl (rdsvl y), z) if z > 0
19883// (srl (rdsvl y), abs(z)) if z < 0
19884// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31].
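// Worked examples (illustrative):
//   (mul (srl (rdsvl 1), 1), 6)   -> (rdsvl 3)           since 6   = 3  * 2^1
//   (mul (srl (rdsvl 1), 1), 128) -> (shl (rdsvl 16), 2)  since 128 = 16 * 2^(1+2)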
19885 static SDValue performMulRdsvlCombine(SDNode *Mul, SelectionDAG &DAG) {
19886   SDLoc DL(Mul);
19887 EVT VT = Mul->getValueType(0);
19888 SDValue MulOp0 = Mul->getOperand(0);
19889 int ConstMultiplier =
19890 cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue();
19891 if ((MulOp0->getOpcode() != ISD::SRL) ||
19892 (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL))
19893 return SDValue();
19894
19895 unsigned AbsConstValue = abs(ConstMultiplier);
19896 unsigned OperandShift =
19897 cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue();
19898
19899 // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y
19900 // integral)
19901 int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift;
19902
19903 // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
19904 // 2^(w + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - w (LowerBound).
19905 unsigned B = ConstMultiplier < 0 ? 32 : 31;
19906 unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
19907 int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift;
19908
19909 // No valid solution found.
19910 if (LowerBound > UpperBound)
19911 return SDValue();
19912
19913 // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra
19914 // shift if possible.
19915 int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
19916
19917 // y = x / 2^(w + z)
19918 int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) *
19919 (ConstMultiplier < 0 ? -1 : 1);
19920 auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
19921 DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
19922
19923 if (Shift == 0)
19924 return Rdsvl;
19925 return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
19926 DAG.getConstant(abs(Shift), DL, MVT::i32),
19928}
19929
19930// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
19931// Same for other types with equivalent constants.
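// For the v4i32 case, for example: (srl X, 15) & 0x10001 moves the sign bit of
// each i16 half of every i32 lane into bit 0 and bit 16, and multiplying by
// 0xffff expands each set sign bit into an all-ones i16 half. Reinterpreted as
// v8i16, that is exactly (X < 0) per 16-bit element, i.e. a CMLT against #0.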
19932 static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
19933   EVT VT = N->getValueType(0);
19934 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
19935 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
19936 return SDValue();
19937 if (N->getOperand(0).getOpcode() != ISD::AND ||
19938 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
19939 return SDValue();
19940
19941 SDValue And = N->getOperand(0);
19942 SDValue Srl = And.getOperand(0);
19943
19944 APInt V1, V2, V3;
19945 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
19946 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
19947       !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
19948     return SDValue();
19949
19950 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
19951 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
19952 V3 != (HalfSize - 1))
19953 return SDValue();
19954
19955 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19956 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
19957 VT.getVectorElementCount() * 2);
19958
19959 SDLoc DL(N);
19960 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
19961 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
19962 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
19963 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
19964}
19965
19966// Transform vector add(zext i8 to i32, zext i8 to i32)
19967// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19968// This allows extra uses of saddl/uaddl at the lower vector widths, and less
19969// extends.
19971 EVT VT = N->getValueType(0);
19972 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19973 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19974 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19975 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19976 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19977 N->getOperand(0).getOperand(0).getValueType() !=
19978 N->getOperand(1).getOperand(0).getValueType())
19979 return SDValue();
19980
19981 if (N->getOpcode() == ISD::MUL &&
19982 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
19983 return SDValue();
19984
19985 SDValue N0 = N->getOperand(0).getOperand(0);
19986 SDValue N1 = N->getOperand(1).getOperand(0);
19987 EVT InVT = N0.getValueType();
19988
19989 EVT S1 = InVT.getScalarType();
19990 EVT S2 = VT.getScalarType();
19991 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19992 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19993 SDLoc DL(N);
19994 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19997 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19998 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19999 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
20000 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
20001 : (unsigned)ISD::SIGN_EXTEND,
20002 DL, VT, NewOp);
20003 }
20004 return SDValue();
20005}
20006
20009 const AArch64Subtarget *Subtarget) {
20010
20011 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
20012 return Ext;
20013   if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
20014     return Ext;
20015 if (SDValue Ext = performVectorExtCombine(N, DAG))
20016 return Ext;
20017
20018 if (DCI.isBeforeLegalizeOps())
20019 return SDValue();
20020
20021 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
20022 // and in MachineCombiner pass, add+mul will be combined into madd.
20023 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
20024 SDLoc DL(N);
20025 EVT VT = N->getValueType(0);
20026 SDValue N0 = N->getOperand(0);
20027 SDValue N1 = N->getOperand(1);
20028 SDValue MulOper;
20029 unsigned AddSubOpc;
20030
20031 auto IsAddSubWith1 = [&](SDValue V) -> bool {
20032 AddSubOpc = V->getOpcode();
20033 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
20034 SDValue Opnd = V->getOperand(1);
20035 MulOper = V->getOperand(0);
20036 if (AddSubOpc == ISD::SUB)
20037 std::swap(Opnd, MulOper);
20038 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
20039 return C->isOne();
20040 }
20041 return false;
20042 };
20043
20044 if (IsAddSubWith1(N0)) {
20045 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
20046 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
20047 }
20048
20049 if (IsAddSubWith1(N1)) {
20050 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
20051 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
20052 }
20053
20054 // The below optimizations require a constant RHS.
20055 if (!isa<ConstantSDNode>(N1))
20056 return SDValue();
20057
20058 if (SDValue Ext = performMulRdsvlCombine(N, DAG))
20059 return Ext;
20060
20061   ConstantSDNode *C = cast<ConstantSDNode>(N1);
20062   const APInt &ConstValue = C->getAPIntValue();
20063
20064 // Allow the scaling to be folded into the `cnt` instruction by preventing
20065   // the scaling from being obscured here. This makes it easier to pattern match.
20066 if (IsSVECntIntrinsic(N0) ||
20067 (N0->getOpcode() == ISD::TRUNCATE &&
20068 (IsSVECntIntrinsic(N0->getOperand(0)))))
20069 if (ConstValue.sge(1) && ConstValue.sle(16))
20070 return SDValue();
20071
20072 // Multiplication of a power of two plus/minus one can be done more
20073 // cheaply as shift+add/sub. For now, this is true unilaterally. If
20074 // future CPUs have a cheaper MADD instruction, this may need to be
20075 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
20076 // 64-bit is 5 cycles, so this is always a win.
20077 // More aggressively, some multiplications N0 * C can be lowered to
20078 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
20079 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
20080 // TODO: lower more cases.
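  // Worked examples of the decompositions handled below (illustrative):
  //   mul x, 7  -> sub (shl x, 3), x             ; 7  = 2^3 - 1
  //   mul x, 6  -> shl (add (shl x, 1), x), 1    ; 6  = (2^1 + 1) * 2^1
  //   mul x, -5 -> sub 0, (add (shl x, 2), x)    ; -5 = -(2^2 + 1)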
20081
20082 // TrailingZeroes is used to test if the mul can be lowered to
20083 // shift+add+shift.
20084 unsigned TrailingZeroes = ConstValue.countr_zero();
20085 if (TrailingZeroes) {
20086 // Conservatively do not lower to shift+add+shift if the mul might be
20087 // folded into smul or umul.
20088 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
20089 isZeroExtended(N0, DAG)))
20090 return SDValue();
20091 // Conservatively do not lower to shift+add+shift if the mul might be
20092 // folded into madd or msub.
20093 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
20094 N->user_begin()->getOpcode() == ISD::SUB))
20095 return SDValue();
20096 }
20097 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
20098 // and shift+add+shift.
20099 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
20100 unsigned ShiftAmt;
20101
20102 auto Shl = [&](SDValue N0, unsigned N1) {
20103 if (!N0.getNode())
20104 return SDValue();
20105 // If shift causes overflow, ignore this combine.
20106 if (N1 >= N0.getValueSizeInBits())
20107 return SDValue();
20108 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
20109 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
20110 };
20111 auto Add = [&](SDValue N0, SDValue N1) {
20112 if (!N0.getNode() || !N1.getNode())
20113 return SDValue();
20114 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
20115 };
20116 auto Sub = [&](SDValue N0, SDValue N1) {
20117 if (!N0.getNode() || !N1.getNode())
20118 return SDValue();
20119 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
20120 };
20121 auto Negate = [&](SDValue N) {
20122 if (!N0.getNode())
20123 return SDValue();
20124 SDValue Zero = DAG.getConstant(0, DL, VT);
20125 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
20126 };
20127
20128   // Can the const C be decomposed into (1+2^M1)*(1+2^N1), e.g.:
20129   // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
20130   // the (2^N - 1) can't be executed via a single instruction.
20131 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
20132 unsigned BitWidth = C.getBitWidth();
20133 for (unsigned i = 1; i < BitWidth / 2; i++) {
20134 APInt Rem;
20135 APInt X(BitWidth, (1 << i) + 1);
20136 APInt::sdivrem(C, X, N, Rem);
20137 APInt NVMinus1 = N - 1;
20138 if (Rem == 0 && NVMinus1.isPowerOf2()) {
20139 M = X;
20140 return true;
20141 }
20142 }
20143 return false;
20144 };
20145
20146   // Can the const C be decomposed into (2^M + 1) * 2^N + 1, e.g.:
20147   // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
20148   // the (2^N - 1) can't be executed via a single instruction.
20149 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
20150 APInt CVMinus1 = C - 1;
20151 if (CVMinus1.isNegative())
20152 return false;
20153 unsigned TrailingZeroes = CVMinus1.countr_zero();
20154 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
20155 if (SCVMinus1.isPowerOf2()) {
20156 unsigned BitWidth = SCVMinus1.getBitWidth();
20157 M = APInt(BitWidth, SCVMinus1.logBase2());
20158 N = APInt(BitWidth, TrailingZeroes);
20159 return true;
20160 }
20161 return false;
20162 };
20163
20164   // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), e.g.:
20165 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
20166 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
20167 APInt CVMinus1 = C - 1;
20168 if (CVMinus1.isNegative())
20169 return false;
20170 unsigned TrailingZeroes = CVMinus1.countr_zero();
20171 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
20172 if (CVPlus1.isPowerOf2()) {
20173 unsigned BitWidth = CVPlus1.getBitWidth();
20174 M = APInt(BitWidth, CVPlus1.logBase2());
20175 N = APInt(BitWidth, TrailingZeroes);
20176 return true;
20177 }
20178 return false;
20179 };
20180
20181 if (ConstValue.isNonNegative()) {
20182 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
20183 // (mul x, 2^N - 1) => (sub (shl x, N), x)
20184 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
20185 // (mul x, (2^M + 1) * (2^N + 1))
20186 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
20187   // (mul x, (2^M + 1) * 2^N + 1)
20188   //     => MV = (add (shl x, M), x); (add (shl MV, N), x)
20189   // (mul x, 1 - (1 - 2^M) * 2^N)
20190   //     => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
20191 APInt SCVMinus1 = ShiftedConstValue - 1;
20192 APInt SCVPlus1 = ShiftedConstValue + 1;
20193 APInt CVPlus1 = ConstValue + 1;
20194 APInt CVM, CVN;
20195 if (SCVMinus1.isPowerOf2()) {
20196 ShiftAmt = SCVMinus1.logBase2();
20197 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
20198 } else if (CVPlus1.isPowerOf2()) {
20199 ShiftAmt = CVPlus1.logBase2();
20200 return Sub(Shl(N0, ShiftAmt), N0);
20201 } else if (SCVPlus1.isPowerOf2()) {
20202 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
20203 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
20204 }
20205 if (Subtarget->hasALULSLFast() &&
20206 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
20207 APInt CVMMinus1 = CVM - 1;
20208 APInt CVNMinus1 = CVN - 1;
20209 unsigned ShiftM1 = CVMMinus1.logBase2();
20210 unsigned ShiftN1 = CVNMinus1.logBase2();
20211 // ALULSLFast implies that shifts of up to 4 places are fast
20212 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
20213 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
20214 return Add(Shl(MVal, ShiftN1), MVal);
20215 }
20216 }
20217 if (Subtarget->hasALULSLFast() &&
20218 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
20219 unsigned ShiftM = CVM.getZExtValue();
20220 unsigned ShiftN = CVN.getZExtValue();
20221 // ALULSLFast implies that shifts of up to 4 places are fast
20222 if (ShiftM <= 4 && ShiftN <= 4) {
20223 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
20224 return Add(Shl(MVal, CVN.getZExtValue()), N0);
20225 }
20226 }
20227
20228 if (Subtarget->hasALULSLFast() &&
20229 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
20230 unsigned ShiftM = CVM.getZExtValue();
20231 unsigned ShiftN = CVN.getZExtValue();
20232 // ALULSLFast implies that shifts of up to 4 places are fast
20233 if (ShiftM <= 4 && ShiftN <= 4) {
20234 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
20235 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
20236 }
20237 }
20238 } else {
20239 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
20240 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
20241 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
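// For example, (mul x, -7) becomes (sub x, (shl x, 3)) and (mul x, -9)
// becomes the negation of (add (shl x, 3), x).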
20242 APInt SCVPlus1 = -ShiftedConstValue + 1;
20243 APInt CVNegPlus1 = -ConstValue + 1;
20244 APInt CVNegMinus1 = -ConstValue - 1;
20245 if (CVNegPlus1.isPowerOf2()) {
20246 ShiftAmt = CVNegPlus1.logBase2();
20247 return Sub(N0, Shl(N0, ShiftAmt));
20248 } else if (CVNegMinus1.isPowerOf2()) {
20249 ShiftAmt = CVNegMinus1.logBase2();
20250 return Negate(Add(Shl(N0, ShiftAmt), N0));
20251 } else if (SCVPlus1.isPowerOf2()) {
20252 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
20253 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
20254 }
20255 }
20256
20257 return SDValue();
20258}
20259
20261 SelectionDAG &DAG) {
20262 // Take advantage of vector comparisons producing 0 or -1 in each lane to
20263 // optimize away operation when it's from a constant.
20264 //
20265 // The general transformation is:
20266 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
20267 // AND(VECTOR_CMP(x,y), constant2)
20268 // constant2 = UNARYOP(constant)
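// For example, with UNARYOP = sint_to_fp and constant = splat(1), each lane of
// the AND is either 0 or 1, so the result equals the compare mask ANDed with
// the bit pattern of sint_to_fp(1); this relies on UNARYOP(0) producing an
// all-zero bit pattern, which holds for the int-to-fp conversions handled here.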
20269
20270 // Early exit if this isn't a vector operation, the operand of the
20271 // unary operation isn't a bitwise AND, or if the sizes of the operations
20272 // aren't the same.
20273 EVT VT = N->getValueType(0);
20274 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
20275 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
20276 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
20277 return SDValue();
20278
20279 // Now check that the other operand of the AND is a constant. We could
20280 // make the transformation for non-constant splats as well, but it's unclear
20281 // that would be a benefit as it would not eliminate any operations, just
20282 // perform one more step in scalar code before moving to the vector unit.
20283 if (BuildVectorSDNode *BV =
20284 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
20285 // Bail out if the vector isn't a constant.
20286 if (!BV->isConstant())
20287 return SDValue();
20288
20289 // Everything checks out. Build up the new and improved node.
20290 SDLoc DL(N);
20291 EVT IntVT = BV->getValueType(0);
20292 // Create a new constant of the appropriate type for the transformed
20293 // DAG.
20294 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
20295 // The AND node needs bitcasts to/from an integer vector type around it.
20296 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
20297 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
20298 N->getOperand(0)->getOperand(0), MaskConst);
20299 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
20300 return Res;
20301 }
20302
20303 return SDValue();
20304}
20305
20306/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
20307/// functions, this can help to reduce the number of fmovs to/from GPRs.
20308static SDValue
20309tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
20310 TargetLowering::DAGCombinerInfo &DCI,
20311 const AArch64Subtarget *Subtarget) {
20312 if (N->isStrictFPOpcode())
20313 return SDValue();
20314
20315 if (DCI.isBeforeLegalizeOps())
20316 return SDValue();
20317
20318 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
20319 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
20320 return SDValue();
20321
20322 auto isSupportedType = [](EVT VT) {
20323 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
20324 };
20325
20326 SDValue SrcVal = N->getOperand(0);
20327 EVT SrcTy = SrcVal.getValueType();
20328 EVT DestTy = N->getValueType(0);
20329
20330 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
20331 return SDValue();
20332
20333 EVT SrcVecTy;
20334 EVT DestVecTy;
20335 if (DestTy.bitsGT(SrcTy)) {
20336 DestVecTy = getPackedSVEVectorVT(DestTy);
20337 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
20338 } else {
20339 SrcVecTy = getPackedSVEVectorVT(SrcTy);
20340 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
20341 }
20342
20343 // Ensure the resulting src/dest vector type is legal.
20344 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
20345 return SDValue();
20346
20347 SDLoc DL(N);
20348 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
20349 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
20350 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
20351 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
20352 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
20353}
20354
20357 const AArch64Subtarget *Subtarget) {
20358 // First try to optimize away the conversion when it's conditionally from
20359 // a constant. Vectors only.
20361 return Res;
20362
20363 if (SDValue Res =
20364 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
20365 return Res;
20366
20367 EVT VT = N->getValueType(0);
20368 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64)
20369 return SDValue();
20370 if (VT == MVT::f16 && !Subtarget->hasFullFP16())
20371 return SDValue();
20372
20373 // Only optimize when the source and destination types have the same width.
20374 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
20375 return SDValue();
20376
20377 // If the result of an integer load is only used by an integer-to-float
20378 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
20379 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
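// For example, (f32 (sint_to_fp (i32 (load addr)))) becomes a SITOF of an f32
// load from the same address, so the value stays in an FP register throughout.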
20380 SDValue N0 = N->getOperand(0);
20381 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
20382 N0.hasOneUse() &&
20383 // Do not change the width of a volatile load.
20384 !cast<LoadSDNode>(N0)->isVolatile()) {
20385 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
20386 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
20387 LN0->getPointerInfo(), LN0->getAlign(),
20388 LN0->getMemOperand()->getFlags());
20389
20390 // Make sure successors of the original load stay after it by updating them
20391 // to use the new Chain.
20392 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
20393
20394 unsigned Opcode =
20395 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
20396 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
20397 }
20398
20399 return SDValue();
20400}
20401
20402/// Fold a floating-point multiply by power of two into floating-point to
20403/// fixed-point conversion.
20406 const AArch64Subtarget *Subtarget) {
20407 if (SDValue Res =
20408 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
20409 return Res;
20410
20411 if (!Subtarget->isNeonAvailable())
20412 return SDValue();
20413
20414 if (!N->getValueType(0).isSimple())
20415 return SDValue();
20416
20417 SDValue Op = N->getOperand(0);
20418 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
20419 return SDValue();
20420
20421 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
20422 return SDValue();
20423
20424 SDValue ConstVec = Op->getOperand(1);
20425 if (!isa<BuildVectorSDNode>(ConstVec))
20426 return SDValue();
20427
20428 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
20429 uint32_t FloatBits = FloatTy.getSizeInBits();
20430 if (FloatBits != 32 && FloatBits != 64 &&
20431 (FloatBits != 16 || !Subtarget->hasFullFP16()))
20432 return SDValue();
20433
20434 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
20435 uint32_t IntBits = IntTy.getSizeInBits();
20436 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
20437 return SDValue();
20438
20439 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
20440 if (IntBits > FloatBits)
20441 return SDValue();
20442
20443 BitVector UndefElements;
20444 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
20445 int32_t Bits = IntBits == 64 ? 64 : 32;
20446 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
20447 if (C == -1 || C == 0 || C > Bits)
20448 return SDValue();
20449
20450 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
20451 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
20452 return SDValue();
20453
20454 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
20455 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
20456 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
20457 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
20458 return SDValue();
20459 }
20460
20461 SDLoc DL(N);
20462 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
20463 N->getOpcode() == ISD::FP_TO_SINT_SAT);
20464 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
20465 : Intrinsic::aarch64_neon_vcvtfp2fxu;
20466 SDValue FixConv =
20468 DAG.getTargetConstant(IntrinsicOpcode, DL, MVT::i32),
20469 Op->getOperand(0), DAG.getTargetConstant(C, DL, MVT::i32));
20470 // We can handle smaller integers by generating an extra trunc.
20471 if (IntBits < FloatBits)
20472 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
20473
20474 return FixConv;
20475}
20476
20477// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
20478// convert to csel(ccmp(.., cc0)), depending on cc1:
20479
20480// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20481// =>
20482// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
20483//
20484// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20485// =>
20486// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
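// For example (illustrative), '(a == 0) && (b < c)' can be selected roughly as:
//   cmp  w0, #0
//   ccmp w1, w2, #0, eq   // if eq, compare b with c; otherwise NZCV = 0000
//   cset w0, lt           // lt is false for NZCV = 0000, as required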
20487static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
20488 EVT VT = N->getValueType(0);
20489 SDValue CSel0 = N->getOperand(0);
20490 SDValue CSel1 = N->getOperand(1);
20491
20492 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
20493 CSel1.getOpcode() != AArch64ISD::CSEL)
20494 return SDValue();
20495
20496 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
20497 return SDValue();
20498
20499 if (!isNullConstant(CSel0.getOperand(0)) ||
20500 !isOneConstant(CSel0.getOperand(1)) ||
20501 !isNullConstant(CSel1.getOperand(0)) ||
20502 !isOneConstant(CSel1.getOperand(1)))
20503 return SDValue();
20504
20505 SDValue Cmp0 = CSel0.getOperand(3);
20506 SDValue Cmp1 = CSel1.getOperand(3);
20509 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
20510 return SDValue();
20511 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
20512 Cmp0.getOpcode() == AArch64ISD::SUBS) {
20513 std::swap(Cmp0, Cmp1);
20514 std::swap(CC0, CC1);
20515 }
20516
20517 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
20518 return SDValue();
20519
20520 SDLoc DL(N);
20521 SDValue CCmp, Condition;
20522 unsigned NZCV;
20523
20524 if (N->getOpcode() == ISD::AND) {
20526 Condition = getCondCode(DAG, InvCC0);
20528 } else {
20530 Condition = getCondCode(DAG, CC0);
20532 }
20533
20534 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
20535
20536 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
20537 if (Op1 && Op1->getAPIntValue().isNegative() &&
20538 Op1->getAPIntValue().sgt(-32)) {
20539 // CCMP accepts constants in the range [0, 31].
20540 // If Op1 is a constant in the range [-31, -1], we
20541 // can select CCMN instead to avoid the extra mov.
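// For example, a conditional compare against -3 becomes CCMN with immediate 3,
// which sets the same flags as comparing against -3.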
20542 SDValue AbsOp1 =
20543 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
20544 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
20545 AbsOp1, NZCVOp, Condition, Cmp0);
20546 } else {
20547 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
20548 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
20549 }
20550 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
20551 CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
20552}
20553
20555 const AArch64Subtarget *Subtarget,
20556 const AArch64TargetLowering &TLI) {
20557 SelectionDAG &DAG = DCI.DAG;
20558
20559 if (SDValue R = performANDORCSELCombine(N, DAG))
20560 return R;
20561
20562 return SDValue();
20563}
20564
20566 if (!MemVT.getVectorElementType().isSimple())
20567 return false;
20568
20569 uint64_t MaskForTy = 0ull;
20570 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
20571 case MVT::i8:
20572 MaskForTy = 0xffull;
20573 break;
20574 case MVT::i16:
20575 MaskForTy = 0xffffull;
20576 break;
20577 case MVT::i32:
20578 MaskForTy = 0xffffffffull;
20579 break;
20580 default:
20581 return false;
20582 break;
20583 }
20584
20585 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
20586 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
20587 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
20588
20589 return false;
20590}
20591
20593 SDValue LeafOp = SDValue(N, 0);
20594 SDValue Op = N->getOperand(0);
20595 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
20596 LeafOp.getValueType() != Op.getValueType())
20597 Op = Op->getOperand(0);
20598 if (LeafOp.getValueType() == Op.getValueType())
20599 return Op;
20600 return SDValue();
20601}
20602
20605 SelectionDAG &DAG = DCI.DAG;
20606 SDValue Src = N->getOperand(0);
20607 unsigned Opc = Src->getOpcode();
20608
20609 // Zero/any extend of an unsigned unpack
20610 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
20611 SDValue UnpkOp = Src->getOperand(0);
20612 SDValue Dup = N->getOperand(1);
20613
20614 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
20615 return SDValue();
20616
20617 SDLoc DL(N);
20618 auto *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
20619 if (!C)
20620 return SDValue();
20621
20622 uint64_t ExtVal = C->getZExtValue();
20623
20624 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
20625 return ((ExtVal == 0xFF && VT == MVT::i8) ||
20626 (ExtVal == 0xFFFF && VT == MVT::i16) ||
20627 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
20628 };
20629
20630 // If the mask is fully covered by the unpack, we don't need to push
20631 // a new AND onto the operand
20632 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
20633 if (MaskAndTypeMatch(EltTy))
20634 return Src;
20635
20636 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
20637 // to see if the mask is all-ones of size MemTy.
20638 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
20639 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
20640 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
20641 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
20642 if (MaskAndTypeMatch(EltTy))
20643 return Src;
20644 }
20645
20646 // Truncate to prevent a DUP with an over-wide constant
20647 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
20648
20649 // Otherwise, make sure we propagate the AND to the operand
20650 // of the unpack
20651 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
20652 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
20653
20654 SDValue And = DAG.getNode(ISD::AND, DL,
20655 UnpkOp->getValueType(0), UnpkOp, Dup);
20656
20657 return DAG.getNode(Opc, DL, N->getValueType(0), And);
20658 }
20659
20660 if (DCI.isBeforeLegalizeOps())
20661 return SDValue();
20662
20663 // If both sides of AND operations are i1 splat_vectors then
20664 // we can produce just i1 splat_vector as the result.
20665 if (isAllActivePredicate(DAG, N->getOperand(0)))
20666 return N->getOperand(1);
20667 if (isAllActivePredicate(DAG, N->getOperand(1)))
20668 return N->getOperand(0);
20669
20671 return SDValue();
20672
20673 SDValue Mask = N->getOperand(1);
20674
20675 if (!Src.hasOneUse())
20676 return SDValue();
20677
20678 EVT MemVT;
20679
20680 // SVE load instructions perform an implicit zero-extend, which makes them
20681 // perfect candidates for combining.
20682 switch (Opc) {
20683 case AArch64ISD::LD1_MERGE_ZERO:
20684 case AArch64ISD::LDNF1_MERGE_ZERO:
20685 case AArch64ISD::LDFF1_MERGE_ZERO:
20686 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
20687 break;
20688 case AArch64ISD::GLD1_MERGE_ZERO:
20689 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20690 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20691 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20692 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20693 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20694 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20695 case AArch64ISD::GLDFF1_MERGE_ZERO:
20696 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
20697 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
20698 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
20699 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
20700 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
20701 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
20702 case AArch64ISD::GLDNT1_MERGE_ZERO:
20703 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
20704 break;
20705 default:
20706 return SDValue();
20707 }
20708
20709 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
20710 return Src;
20711
20712 return SDValue();
20713}
20714
20715// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
20716static SDValue performANDSETCCCombine(SDNode *N,
20717 TargetLowering::DAGCombinerInfo &DCI) {
20718
20719 // This function performs an optimization on a specific pattern involving
20720 // an AND operation and SETCC (Set Condition Code) node.
20721
20722 SDValue SetCC = N->getOperand(0);
20723 EVT VT = N->getValueType(0);
20724 SelectionDAG &DAG = DCI.DAG;
20725
20726 // If the current node (N) is used by any SELECT instruction, return an
20727 // empty SDValue and skip the optimization, since applying it in that case
20728 // could produce incorrect results.
20729 for (auto U : N->users())
20730 if (U->getOpcode() == ISD::SELECT)
20731 return SDValue();
20732
20733 // Check if the operand is a SETCC node with floating-point comparison
20734 if (SetCC.getOpcode() == ISD::SETCC &&
20735 SetCC.getOperand(0).getValueType() == MVT::f32) {
20736
20737 SDValue Cmp;
20738 AArch64CC::CondCode CC;
20739
20740 // Check if the DAG is after legalization and if we can emit the conjunction
20741 if (!DCI.isBeforeLegalize() &&
20742 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
20743
20745
20746 SDLoc DL(N);
20747 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
20748 DAG.getConstant(0, DL, VT),
20749 getCondCode(DAG, InvertedCC), Cmp);
20750 }
20751 }
20752 return SDValue();
20753}
20754
20757 SelectionDAG &DAG = DCI.DAG;
20758 SDValue LHS = N->getOperand(0);
20759 SDValue RHS = N->getOperand(1);
20760 EVT VT = N->getValueType(0);
20761
20762 if (SDValue R = performANDORCSELCombine(N, DAG))
20763 return R;
20764
20765 if (SDValue R = performANDSETCCCombine(N,DCI))
20766 return R;
20767
20768 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
20769 return SDValue();
20770
20771 if (VT.isScalableVector())
20772 return performSVEAndCombine(N, DCI);
20773
20774 // The combining code below works only for NEON vectors. In particular, it
20775 // does not work for SVE when dealing with vectors wider than 128 bits.
20776 if (!VT.is64BitVector() && !VT.is128BitVector())
20777 return SDValue();
20778
20780 if (!BVN)
20781 return SDValue();
20782
20783 // AND does not accept an immediate, so check if we can use a BIC immediate
20784 // instruction instead. We do this here instead of using a (and x, (mvni imm))
20785 // pattern in isel, because some immediates may be lowered to the preferred
20786 // (and x, (movi imm)) form, even though an mvni representation also exists.
20787 APInt DefBits(VT.getSizeInBits(), 0);
20788 APInt UndefBits(VT.getSizeInBits(), 0);
20789 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
20790 SDValue NewOp;
20791
20792 // Any bits known to already be 0 need not be cleared again, which can help
20793 // reduce the size of the immediate to one supported by the instruction.
20794 KnownBits Known = DAG.computeKnownBits(LHS);
20795 APInt ZeroSplat(VT.getSizeInBits(), 0);
20796 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
20797 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
20798 << (Known.Zero.getBitWidth() * I);
20799
20800 DefBits = ~(DefBits | ZeroSplat);
20801 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20802 DefBits, &LHS)) ||
20803 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20804 DefBits, &LHS)))
20805 return NewOp;
20806
20807 UndefBits = ~(UndefBits | ZeroSplat);
20808 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20809 UndefBits, &LHS)) ||
20810 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20811 UndefBits, &LHS)))
20812 return NewOp;
20813 }
20814
20815 return SDValue();
20816}
20817
20820 SelectionDAG &DAG = DCI.DAG;
20821 SDValue LHS = N->getOperand(0);
20822 SDValue RHS = N->getOperand(1);
20823 EVT VT = N->getValueType(0);
20824 SDLoc DL(N);
20825
20826 if (!N->getFlags().hasAllowReassociation())
20827 return SDValue();
20828
20829 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
20830 auto ReassocComplex = [&](SDValue A, SDValue B) {
20831 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
20832 return SDValue();
20833 unsigned Opc = A.getConstantOperandVal(0);
20834 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
20835 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
20836 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
20837 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
20838 return SDValue();
20839 SDValue VCMLA = DAG.getNode(
20840 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
20841 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
20842 A.getOperand(2), A.getOperand(3));
20843 VCMLA->setFlags(A->getFlags());
20844 return VCMLA;
20845 };
20846 if (SDValue R = ReassocComplex(LHS, RHS))
20847 return R;
20848 if (SDValue R = ReassocComplex(RHS, LHS))
20849 return R;
20850
20851 return SDValue();
20852}
20853
20854static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
20855 switch (Opcode) {
20856 case ISD::STRICT_FADD:
20857 case ISD::FADD:
20858 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
20859 case ISD::ADD:
20860 return VT == MVT::i64;
20861 default:
20862 return false;
20863 }
20864}
20865
20866static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20868
20870 if ((N.getOpcode() == ISD::SETCC) ||
20871 // get_active_lane_mask is lowered to a whilelo instruction.
20872 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
20873 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
20874 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
20875 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege_x2 ||
20876 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
20877 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt_x2 ||
20878 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
20879 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi_x2 ||
20880 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
20881 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs_x2 ||
20882 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
20883 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele_x2 ||
20884 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
20885 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo_x2 ||
20886 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
20887 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels_x2 ||
20888 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
20889 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt_x2)))
20890 return true;
20891
20892 return false;
20893}
20894
20895// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
20896// ... into: "ptrue p, all" + PTEST
20897static SDValue
20900 const AArch64Subtarget *Subtarget) {
20901 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20902 // Make sure PTEST can be legalised with illegal types.
20903 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20904 return SDValue();
20905
20906 SDValue N0 = N->getOperand(0);
20907 EVT VT = N0.getValueType();
20908
20909 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
20910 !isNullConstant(N->getOperand(1)))
20911 return SDValue();
20912
20913 // Restrict the DAG combine to only cases where we're extracting from a
20914 // flag-setting operation.
20915 if (!isPredicateCCSettingOp(N0) || N0.getResNo() != 0)
20916 return SDValue();
20917
20918 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
20919 SelectionDAG &DAG = DCI.DAG;
20920 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
20921 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
20922}
20923
20924// Materialize : Idx = (add (mul vscale, NumEls), -1)
20925// i1 = extract_vector_elt t37, Constant:i64<Idx>
20926// ... into: "ptrue p, all" + PTEST
20927static SDValue
20930 const AArch64Subtarget *Subtarget) {
20931 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20932 // Make sure PTEST can be legalised with illegal types.
20933 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20934 return SDValue();
20935
20936 SDValue N0 = N->getOperand(0);
20937 EVT OpVT = N0.getValueType();
20938
20939 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
20940 return SDValue();
20941
20942 // Idx == (add (mul vscale, NumEls), -1)
20943 SDValue Idx = N->getOperand(1);
20944 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
20945 return SDValue();
20946
20947 SDValue VS = Idx.getOperand(0);
20948 if (VS.getOpcode() != ISD::VSCALE)
20949 return SDValue();
20950
20951 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
20952 if (VS.getConstantOperandVal(0) != NumEls)
20953 return SDValue();
20954
20955 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
20956 SelectionDAG &DAG = DCI.DAG;
20957 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
20958 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
20959}
20960
20961static SDValue
20963 const AArch64Subtarget *Subtarget) {
20964 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20965 SelectionDAG &DAG = DCI.DAG;
20966 SDValue Vec = N->getOperand(0);
20967 SDValue Idx = N->getOperand(1);
20968
20969 if (DCI.isBeforeLegalize() || Idx.getOpcode() != ISD::VECTOR_FIND_LAST_ACTIVE)
20970 return SDValue();
20971
20972 // Only legal for 8, 16, 32, and 64 bit element types.
20973 EVT EltVT = Vec.getValueType().getVectorElementType();
20974 if (!is_contained(ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
20975 MVT::bf16, MVT::f32, MVT::f64}),
20976 EltVT.getSimpleVT().SimpleTy))
20977 return SDValue();
20978
20979 SDValue Mask = Idx.getOperand(0);
20980 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20981 if (!TLI.isOperationLegal(ISD::VECTOR_FIND_LAST_ACTIVE, Mask.getValueType()))
20982 return SDValue();
20983
20984 return DAG.getNode(AArch64ISD::LASTB, SDLoc(N), N->getValueType(0), Mask,
20985 Vec);
20986}
20987
20988static SDValue
20990 const AArch64Subtarget *Subtarget) {
20991 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20992 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
20993 return Res;
20994 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
20995 return Res;
20996 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
20997 return Res;
20998
20999 SelectionDAG &DAG = DCI.DAG;
21000 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
21001
21002 EVT VT = N->getValueType(0);
21003 const bool FullFP16 = Subtarget->hasFullFP16();
21004 bool IsStrict = N0->isStrictFPOpcode();
21005
21006 // extract(dup x) -> x
21007 if (N0.getOpcode() == AArch64ISD::DUP)
21008 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
21009 : N0.getOperand(0);
21010
21011 // Rewrite for pairwise fadd pattern
21012 // (f32 (extract_vector_elt
21013 // (fadd (vXf32 Other)
21014 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
21015 // ->
21016 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
21017 // (extract_vector_elt (vXf32 Other) 1))
21018 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
21019 // we can only do this when it's used only by the extract_vector_elt.
21020 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
21021 (!IsStrict || N0.hasOneUse())) {
21022 SDLoc DL(N0);
21023 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
21024 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
21025
21026 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
21027 SDValue Other = N00;
21028
21029 // And handle the commutative case.
21030 if (!Shuffle) {
21031 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
21032 Other = N01;
21033 }
21034
21035 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
21036 Other == Shuffle->getOperand(0)) {
21037 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
21038 DAG.getConstant(0, DL, MVT::i64));
21039 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
21040 DAG.getConstant(1, DL, MVT::i64));
21041 if (!IsStrict)
21042 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
21043
21044 // For strict_fadd we need uses of the final extract_vector to be replaced
21045 // with the strict_fadd, but we also need uses of the chain output of the
21046 // original strict_fadd to use the chain output of the new strict_fadd as
21047 // otherwise it may not be deleted.
21048 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
21049 {VT, MVT::Other},
21050 {N0->getOperand(0), Extract1, Extract2});
21051 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
21052 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
21053 return SDValue(N, 0);
21054 }
21055 }
21056
21057 // Given an extract(load) or extract(extend(load)), produce a scalar load
21058 // instead to avoid the cross-register-bank copies.
21059 if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
21060 VT.isInteger() && isa<ConstantSDNode>(N1)) {
21061 SDValue LoadN0 = N0;
21062 // Look through sext/zext and extract_subvector / insert_subvector if
21063 // required.
21064 if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
21065 N0.getOpcode() == ISD::SIGN_EXTEND ||
21066 N0.getOpcode() == ISD::ANY_EXTEND) &&
21067 N0.getOperand(0).hasOneUse())
21068 LoadN0 = N0.getOperand(0);
21069 unsigned OffsetElts = 0;
21070 if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
21071 OffsetElts = LoadN0.getConstantOperandVal(1);
21072 LoadN0 = LoadN0.getOperand(0);
21073 }
21074 if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
21075 LoadN0.getOperand(0).isUndef() &&
21076 isNullConstant(LoadN0.getOperand(2)) &&
21077 LoadN0.getOperand(1).hasOneUse())
21078 LoadN0 = LoadN0.getOperand(1);
21079
21080 // Check all the uses are valid and can be scalarized. We check that all the
21081 // uses are extracts and those extracts are not re-inserted into an
21082 // operation best treated as a vector register.
21083 auto Load = dyn_cast<LoadSDNode>(LoadN0);
21084 if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
21085 Load->getMemoryVT().isByteSized() &&
21086 all_of(N0->uses(), [&](const SDUse &U) {
21087 return U.getResNo() != N0.getResNo() ||
21088 (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21089 !any_of(U.getUser()->uses(), [](const SDUse &U2) {
21090 return U2.getUser()->getOpcode() ==
21091 ISD::INSERT_VECTOR_ELT ||
21092 U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
21093 U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
21094 }));
21095 })) {
21096
21097 SDLoc DL(Load);
21098
21099 // Generate a new scalar load.
21100 unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
21101 Load->getValueType(0).getScalarSizeInBits() / 8;
21102 SDValue BasePtr = DAG.getObjectPtrOffset(
21103 DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
21104 ISD::LoadExtType ExtType =
21108 : ISD::EXTLOAD);
21109 SDValue ScalarLoad =
21110 DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr,
21111 Load->getPointerInfo().getWithOffset(Offset),
21112 Load->getValueType(0).getScalarType(),
21113 commonAlignment(Load->getAlign(), Offset),
21114 Load->getMemOperand()->getFlags(), Load->getAAInfo());
21115 DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
21116 return ScalarLoad;
21117 }
21118 }
21119
21120 return SDValue();
21121}
21122
21125 SelectionDAG &DAG) {
21126 SDLoc DL(N);
21127 EVT VT = N->getValueType(0);
21128 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
21129 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
21130
21131 if (VT.isScalableVector())
21132 return SDValue();
21133
21134 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
21135 N1Opc == ISD::TRUNCATE) {
21136 SDValue N00 = N0->getOperand(0);
21137 SDValue N10 = N1->getOperand(0);
21138 EVT N00VT = N00.getValueType();
21139 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
21140
21141 // Optimize concat_vectors of truncated vectors, where the intermediate
21142 // type is illegal, to avoid said illegality, e.g.,
21143 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
21144 // (v2i16 (truncate (v2i64)))))
21145 // ->
21146 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
21147 // (v4i32 (bitcast (v2i64))),
21148 // <0, 2, 4, 6>)))
21149 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
21150 // on both input and result type, so we might generate worse code.
21151 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
21152 if (N00VT == N10.getValueType() &&
21153 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
21154 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
21155 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
21157 for (size_t i = 0; i < Mask.size(); ++i)
21158 Mask[i] = i * 2;
21159 return DAG.getNode(ISD::TRUNCATE, DL, VT,
21160 DAG.getVectorShuffle(
21161 MidVT, DL,
21162 DAG.getNode(ISD::BITCAST, DL, MidVT, N00),
21163 DAG.getNode(ISD::BITCAST, DL, MidVT, N10), Mask));
21164 }
21165
21166 // Optimize two large shifts and a combine into a single combine and shift
21167 // For AArch64 architectures, sequences like the following:
21168 //
21169 // ushr v0.4s, v0.4s, #20
21170 // ushr v1.4s, v1.4s, #20
21171 // uzp1 v0.8h, v0.8h, v1.8h
21172 //
21173 // Can be optimized to:
21174 //
21175 // uzp2 v0.8h, v0.8h, v1.8h
21176 // ushr v0.8h, v0.8h, #4
21177 //
21178 // This optimization reduces instruction count.
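// With 32-bit source lanes narrowed to 16-bit results, the uzp2 keeps the high
// halves, so the remaining shift is the original amount minus the narrow lane
// width, e.g. 20 - 16 = 4 in the sequence above.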
21179 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
21180 N00->getOperand(1) == N10->getOperand(1)) {
21181 SDValue N000 = N00->getOperand(0);
21182 SDValue N100 = N10->getOperand(0);
21183 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
21184 N101ConstVal = N10->getConstantOperandVal(1),
21185 NScalarSize = N->getValueType(0).getScalarSizeInBits();
21186
21187 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
21188 N000 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N000);
21189 N100 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N100);
21190 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, DL, VT, N000, N100);
21191 SDValue NewShiftConstant =
21192 DAG.getTargetConstant(N001ConstVal - NScalarSize, DL, MVT::i32);
21193
21194 return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
21195 }
21196 }
21197 }
21198
21199 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
21200 N->getOperand(0).getValueType() == MVT::v2i16 ||
21201 N->getOperand(0).getValueType() == MVT::v2i8) {
21202 EVT SrcVT = N->getOperand(0).getValueType();
21203 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
21204 // loads to prevent having to go through the v4i8 load legalization that
21205 // needs to extend each element into a larger type.
21206 if (N->getNumOperands() % 2 == 0 &&
21207 all_of(N->op_values(), [SrcVT](SDValue V) {
21208 if (V.getValueType() != SrcVT)
21209 return false;
21210 if (V.isUndef())
21211 return true;
21212 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
21213 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
21214 LD->getExtensionType() == ISD::NON_EXTLOAD;
21215 })) {
21216 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
21217 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
21218 SmallVector<SDValue> Ops;
21219
21220 for (unsigned i = 0; i < N->getNumOperands(); i++) {
21221 SDValue V = N->getOperand(i);
21222 if (V.isUndef())
21223 Ops.push_back(DAG.getUNDEF(FVT));
21224 else {
21225 LoadSDNode *LD = cast<LoadSDNode>(V);
21226 SDValue NewLoad = DAG.getLoad(FVT, DL, LD->getChain(),
21227 LD->getBasePtr(), LD->getMemOperand());
21228 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
21229 Ops.push_back(NewLoad);
21230 }
21231 }
21232 return DAG.getBitcast(N->getValueType(0),
21233 DAG.getBuildVector(NVT, DL, Ops));
21234 }
21235 }
21236
21237 // Canonicalise concat_vectors to replace concatenations of truncated nots
21238 // with nots of concatenated truncates. This in some cases allows for multiple
21239 // redundant negations to be eliminated.
21240 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
21241 // (v4i16 (truncate (not (v4i32)))))
21242 // ->
21243 // (not (concat_vectors (v4i16 (truncate (v4i32))),
21244 // (v4i16 (truncate (v4i32)))))
21245 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
21246 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
21247 N->isOnlyUserOf(N1.getNode())) {
21248 auto isBitwiseVectorNegate = [](SDValue V) {
21249 return V->getOpcode() == ISD::XOR &&
21250 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
21251 };
21252 SDValue N00 = N0->getOperand(0);
21253 SDValue N10 = N1->getOperand(0);
21254 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
21255 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
21256 return DAG.getNOT(
21257 DL,
21260 N00->getOperand(0)),
21262 N10->getOperand(0))),
21263 VT);
21264 }
21265 }
21266
21267 // Wait till after everything is legalized to try this. That way we have
21268 // legal vector types and such.
21269 if (DCI.isBeforeLegalizeOps())
21270 return SDValue();
21271
21272 // Optimise concat_vectors of two identical binops with a 128-bit destination
21273 // size, combine into a binop of two concats of the source vectors, e.g.:
21274 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
21275 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
21276 (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
21277 isVectorizedBinOp(N0Opc)) &&
21278 N0->hasOneUse() && N1->hasOneUse()) {
21279 SDValue N00 = N0->getOperand(0);
21280 SDValue N01 = N0->getOperand(1);
21281 SDValue N10 = N1->getOperand(0);
21282 SDValue N11 = N1->getOperand(1);
21283
21284 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
21285 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N00, N10);
21286 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N01, N11);
21287 return DAG.getNode(N0Opc, DL, VT, Concat0, Concat1);
21288 }
21289 }
21290
21291 auto IsRSHRN = [](SDValue Shr) {
21292 if (Shr.getOpcode() != AArch64ISD::VLSHR)
21293 return false;
21294 SDValue Op = Shr.getOperand(0);
21295 EVT VT = Op.getValueType();
21296 unsigned ShtAmt = Shr.getConstantOperandVal(1);
21297 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
21298 return false;
21299
21300 APInt Imm;
21301 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
21302 Imm = APInt(VT.getScalarSizeInBits(),
21303 Op.getOperand(1).getConstantOperandVal(0)
21304 << Op.getOperand(1).getConstantOperandVal(1));
21305 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
21306 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
21307 Imm = APInt(VT.getScalarSizeInBits(),
21308 Op.getOperand(1).getConstantOperandVal(0));
21309 else
21310 return false;
21311
21312 if (Imm != 1ULL << (ShtAmt - 1))
21313 return false;
21314 return true;
21315 };
21316
21317 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
21318 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
21319 ((IsRSHRN(N1) &&
21320 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
21321 N1.isUndef())) {
21322 SDValue X = N0.getOperand(0).getOperand(0);
21323 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
21324 : N1.getOperand(0).getOperand(0);
21325 EVT BVT =
21326 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
21327 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, DL, BVT, X, Y);
21328 SDValue Add = DAG.getNode(
21329 ISD::ADD, DL, BVT, CC,
21330 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), DL, BVT));
21331 SDValue Shr =
21332 DAG.getNode(AArch64ISD::VLSHR, DL, BVT, Add, N0.getOperand(1));
21333 return Shr;
21334 }
21335
21336 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
21337 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
21338 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
21339 N0.getOperand(1) == N1.getOperand(1)) {
21340 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
21341 DAG.getUNDEF(N0.getValueType()));
21342 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(1),
21343 DAG.getUNDEF(N0.getValueType()));
21344 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, E0, E1);
21345 }
21346
21347 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
21348 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
21349 // canonicalise to that.
21350 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
21351 assert(VT.getScalarSizeInBits() == 64);
21352 return DAG.getNode(AArch64ISD::DUPLANE64, DL, VT, WidenVector(N0, DAG),
21353 DAG.getConstant(0, DL, MVT::i64));
21354 }
21355
21356 // Canonicalise concat_vectors so that the right-hand vector has as few
21357 // bit-casts as possible before its real operation. The primary matching
21358 // destination for these operations will be the narrowing "2" instructions,
21359 // which depend on the operation being performed on this right-hand vector.
21360 // For example,
21361 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
21362 // becomes
21363 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
21364
21365 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
21366 return SDValue();
21367 SDValue RHS = N1->getOperand(0);
21368 MVT RHSTy = RHS.getValueType().getSimpleVT();
21369 // If the RHS is not a vector, this is not the pattern we're looking for.
21370 if (!RHSTy.isVector())
21371 return SDValue();
21372
21373 LLVM_DEBUG(
21374 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
21375
21376 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
21377 RHSTy.getVectorNumElements() * 2);
21378 return DAG.getNode(ISD::BITCAST, DL, VT,
21379 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatTy,
21380 DAG.getNode(ISD::BITCAST, DL, RHSTy, N0),
21381 RHS));
21382}
21383
21384static SDValue
21386 SelectionDAG &DAG) {
21387 if (DCI.isBeforeLegalizeOps())
21388 return SDValue();
21389
21390 EVT VT = N->getValueType(0);
21391 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
21392 return SDValue();
21393
21394 SDValue V = N->getOperand(0);
21395
21396 // NOTE: This combine exists in DAGCombiner, but that version's legality check
21397 // blocks this combine because the non-const case requires custom lowering.
21398 //
21399 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
21400 if (V.getOpcode() == ISD::SPLAT_VECTOR)
21401 if (isa<ConstantSDNode>(V.getOperand(0)))
21402 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
21403
21404 return SDValue();
21405}
21406
21407static SDValue
21409 SelectionDAG &DAG) {
21410 SDLoc DL(N);
21411 SDValue Vec = N->getOperand(0);
21412 SDValue SubVec = N->getOperand(1);
21413 uint64_t IdxVal = N->getConstantOperandVal(2);
21414 EVT VecVT = Vec.getValueType();
21415 EVT SubVT = SubVec.getValueType();
21416
21417 // Promote fixed length vector zeros.
21418 if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
21419 Vec.isUndef() && isZerosVector(SubVec.getNode()))
21420 return VecVT.isInteger() ? DAG.getConstant(0, DL, VecVT)
21421 : DAG.getConstantFP(0, DL, VecVT);
21422
21423 // Only do this for legal fixed vector types.
21424 if (!VecVT.isFixedLengthVector() ||
21425 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
21426 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
21427 return SDValue();
21428
21429 // Ignore widening patterns.
21430 if (IdxVal == 0 && Vec.isUndef())
21431 return SDValue();
21432
21433 // Subvector must be half the width and an "aligned" insertion.
21434 unsigned NumSubElts = SubVT.getVectorNumElements();
21435 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
21436 (IdxVal != 0 && IdxVal != NumSubElts))
21437 return SDValue();
21438
21439 // Fold insert_subvector -> concat_vectors
21440 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
21441 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
21442 SDValue Lo, Hi;
21443 if (IdxVal == 0) {
21444 Lo = SubVec;
21445 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
21446 DAG.getVectorIdxConstant(NumSubElts, DL));
21447 } else {
21448 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
21449 DAG.getVectorIdxConstant(0, DL));
21450 Hi = SubVec;
21451 }
21452 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
21453}
21454
21457 SelectionDAG &DAG) {
21458 // Wait until after everything is legalized to try this. That way we have
21459 // legal vector types and such.
21460 if (DCI.isBeforeLegalizeOps())
21461 return SDValue();
21462 // Transform a scalar conversion of a value from a lane extract into a
21463 // lane extract of a vector conversion. E.g., from foo1 to foo2:
21464 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
21465 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
21466 //
21467 // The second form interacts better with instruction selection and the
21468 // register allocator to avoid cross-class register copies that aren't
21469 // coalescable due to a lane reference.
21470
21471 // Check the operand and see if it originates from a lane extract.
21472 SDValue Op1 = N->getOperand(1);
21474 return SDValue();
21475
21476 // Yep, no additional predication needed. Perform the transform.
21477 SDValue IID = N->getOperand(0);
21478 SDValue Shift = N->getOperand(2);
21479 SDValue Vec = Op1.getOperand(0);
21480 SDValue Lane = Op1.getOperand(1);
21481 EVT ResTy = N->getValueType(0);
21482 EVT VecResTy;
21483 SDLoc DL(N);
21484
21485 // The vector width should be 128 bits by the time we get here, even
21486 // if it started as 64 bits (the extract_vector handling will have
21487 // done so). Bail if it is not.
21488 if (Vec.getValueSizeInBits() != 128)
21489 return SDValue();
21490
21491 if (Vec.getValueType() == MVT::v4i32)
21492 VecResTy = MVT::v4f32;
21493 else if (Vec.getValueType() == MVT::v2i64)
21494 VecResTy = MVT::v2f64;
21495 else
21496 return SDValue();
21497
21498 SDValue Convert =
21499 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
21500 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
21501}
21502
21503// AArch64 high-vector "long" operations are formed by performing the non-high
21504// version on an extract_subvector of each operand which gets the high half:
21505//
21506// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
21507//
21508// However, there are cases which don't have an extract_high explicitly, but
21509// have another operation that can be made compatible with one for free. For
21510// example:
21511//
21512// (dupv64 scalar) --> (extract_high (dup128 scalar))
21513//
21514// This routine does the actual conversion of such DUPs, once outer routines
21515// have determined that everything else is in order.
21516// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
21517// similarly here.
21519 MVT VT = N.getSimpleValueType();
21520 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21521 N.getConstantOperandVal(1) == 0)
21522 N = N.getOperand(0);
21523
21524 switch (N.getOpcode()) {
21525 case AArch64ISD::DUP:
21526 case AArch64ISD::DUPLANE8:
21527 case AArch64ISD::DUPLANE16:
21528 case AArch64ISD::DUPLANE32:
21529 case AArch64ISD::DUPLANE64:
21530 case AArch64ISD::MOVI:
21531 case AArch64ISD::MOVIshift:
21532 case AArch64ISD::MOVIedit:
21533 case AArch64ISD::MOVImsl:
21534 case AArch64ISD::MVNIshift:
21535 case AArch64ISD::MVNImsl:
21536 break;
21537 default:
21538 // FMOV could be supported, but isn't very useful, as it would only occur
21539 // if you passed a bitcast floating-point immediate to an eligible long
21540 // integer op (addl, smull, ...).
21541 return SDValue();
21542 }
21543
21544 if (!VT.is64BitVector())
21545 return SDValue();
21546
21547 SDLoc DL(N);
21548 unsigned NumElems = VT.getVectorNumElements();
21549 if (N.getValueType().is64BitVector()) {
21550 MVT ElementTy = VT.getVectorElementType();
21551 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
21552 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
21553 }
21554
21555 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
21556 DAG.getConstant(NumElems, DL, MVT::i64));
21557}
21558
21560 if (N.getOpcode() == ISD::BITCAST)
21561 N = N.getOperand(0);
21562 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21563 return false;
21564 if (N.getOperand(0).getValueType().isScalableVector())
21565 return false;
21566 return N.getConstantOperandAPInt(1) ==
21567 N.getOperand(0).getValueType().getVectorNumElements() / 2;
21568}
21569
21570/// Helper structure to keep track of ISD::SET_CC operands.
21576
21577/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
21582
21583/// Helper structure to keep track of SetCC information.
21588
21589/// Helper structure to be able to read SetCC information. If the IsAArch64
21590/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
21591/// GenericSetCCInfo.
21596
21597/// Check whether or not \p Op is a SET_CC operation, either a generic or
21598/// an
21599/// AArch64 lowered one.
21600/// \p SetCCInfo is filled accordingly.
21601/// \post SetCCInfo is meaningful only when this function returns true.
21602/// \return True when Op is a kind of SET_CC operation.
21604 // If this is a setcc, this is straightforward.
21605 if (Op.getOpcode() == ISD::SETCC) {
21606 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
21607 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
21608 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
21609 SetCCInfo.IsAArch64 = false;
21610 return true;
21611 }
21612 // Otherwise, check if this is a matching csel instruction.
21613 // In other words:
21614 // - csel 1, 0, cc
21615 // - csel 0, 1, !cc
21616 if (Op.getOpcode() != AArch64ISD::CSEL)
21617 return false;
21618 // Set the information about the operands.
21619 // TODO: we want the operands of the Cmp not the csel
21620 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
21621 SetCCInfo.IsAArch64 = true;
21622 SetCCInfo.Info.AArch64.CC =
21623 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21624
21625 // Check that the operands match the constraints:
21626 // (1) Both operands must be constants.
21627 // (2) One must be 1 and the other must be 0.
21628 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
21629 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
21630
21631 // Check (1).
21632 if (!TValue || !FValue)
21633 return false;
21634
21635 // Check (2).
21636 if (!TValue->isOne()) {
21637 // Update the comparison when we are interested in !cc.
21638 std::swap(TValue, FValue);
21639 SetCCInfo.Info.AArch64.CC =
21641 }
21642 return TValue->isOne() && FValue->isZero();
21643}
21644
21645// Returns true if Op is setcc or zext of setcc.
21647 if (isSetCC(Op, Info))
21648 return true;
21649 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
21650 isSetCC(Op->getOperand(0), Info));
21651}
21652
21653// The folding we want to perform is:
21654// (add x, [zext] (setcc cc ...) )
21655// -->
21656// (csel x, (add x, 1), !cc ...)
21657//
21658// The latter will get matched to a CSINC instruction.
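// For example (illustrative), 'x + (a < b)' can then be selected as
// 'cmp a, b' followed by 'cinc x, x, lt'.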
21660 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
21661 SDValue LHS = Op->getOperand(0);
21662 SDValue RHS = Op->getOperand(1);
21663 SetCCInfoAndKind InfoAndKind;
21664
21665 // If both operands are a SET_CC, then we don't want to perform this
21666 // folding and create another csel as this results in more instructions
21667 // (and higher register usage).
21668 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
21669 isSetCCOrZExtSetCC(RHS, InfoAndKind))
21670 return SDValue();
21671
21672 // If neither operand is a SET_CC, give up.
21673 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
21674 std::swap(LHS, RHS);
21675 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
21676 return SDValue();
21677 }
21678
21679 // FIXME: This could be generalized to work for FP comparisons.
21680 EVT CmpVT = InfoAndKind.IsAArch64
21681 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
21682 : InfoAndKind.Info.Generic.Opnd0->getValueType();
21683 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
21684 return SDValue();
21685
21686 SDValue CCVal;
21687 SDValue Cmp;
21688 SDLoc DL(Op);
21689 if (InfoAndKind.IsAArch64) {
21690 CCVal = DAG.getConstant(
21692 MVT::i32);
21693 Cmp = *InfoAndKind.Info.AArch64.Cmp;
21694 } else
21695 Cmp = getAArch64Cmp(
21696 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
21697 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
21698 DL);
21699
21700 EVT VT = Op->getValueType(0);
21701 LHS = DAG.getNode(ISD::ADD, DL, VT, RHS, DAG.getConstant(1, DL, VT));
21702 return DAG.getNode(AArch64ISD::CSEL, DL, VT, RHS, LHS, CCVal, Cmp);
21703}
21704
21705// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
21707 EVT VT = N->getValueType(0);
21708 // Only scalar integer and vector types.
21709 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
21710 return SDValue();
21711
21712 SDValue LHS = N->getOperand(0);
21713 SDValue RHS = N->getOperand(1);
21714 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21715 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
21716 return SDValue();
21717
21718 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
21719 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
21720 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
21721 return SDValue();
21722
21723 SDValue Op1 = LHS->getOperand(0);
21724 SDValue Op2 = RHS->getOperand(0);
21725 EVT OpVT1 = Op1.getValueType();
21726 EVT OpVT2 = Op2.getValueType();
21727 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
21728 Op2.getOpcode() != AArch64ISD::UADDV ||
21729 OpVT1.getVectorElementType() != VT)
21730 return SDValue();
21731
21732 SDValue Val1 = Op1.getOperand(0);
21733 SDValue Val2 = Op2.getOperand(0);
21734 EVT ValVT = Val1->getValueType(0);
21735 SDLoc DL(N);
21736 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
21737 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
21738 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
21739 DAG.getConstant(0, DL, MVT::i64));
21740}
21741
21742/// Perform the scalar expression combine in the form of:
21743/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
21744/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
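/// Since CSINC(t, f, cc) yields t when cc holds and f+1 otherwise,
/// CSEL(c, 1, cc) + b, i.e. cc ? c+b : 1+b, is exactly CSINC(b+c, b, cc);
/// the CSNEG(c, -1, cc) case works the same way because -(-1) == 1.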
21746 EVT VT = N->getValueType(0);
21747 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
21748 return SDValue();
21749
21750 SDValue LHS = N->getOperand(0);
21751 SDValue RHS = N->getOperand(1);
21752
21753 // Handle commutativity.
21754 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21755 LHS.getOpcode() != AArch64ISD::CSNEG) {
21756 std::swap(LHS, RHS);
21757 if (LHS.getOpcode() != AArch64ISD::CSEL &&
21758 LHS.getOpcode() != AArch64ISD::CSNEG) {
21759 return SDValue();
21760 }
21761 }
21762
21763 if (!LHS.hasOneUse())
21764 return SDValue();
21765
21766 AArch64CC::CondCode AArch64CC =
21767 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
21768
21769 // The CSEL should have a constant one operand, and the CSNEG should have a
21770 // one or negative-one operand.
21771 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
21772 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
21773 if (!CTVal || !CFVal)
21774 return SDValue();
21775
21776 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
21777 (CTVal->isOne() || CFVal->isOne())) &&
21778 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
21779 (CTVal->isOne() || CFVal->isAllOnes())))
21780 return SDValue();
21781
21782 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
21783 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
21784 !CFVal->isOne()) {
21785 std::swap(CTVal, CFVal);
21786 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
21787 }
21788
21789 SDLoc DL(N);
21790 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
21791 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
21792 !CFVal->isAllOnes()) {
21793 APInt C = -1 * CFVal->getAPIntValue();
21794 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
21795 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
21796 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
21797 }
21798
21799 // It might be neutral for larger constants, as the immediate needs to be
21800 // materialized in a register.
21801 APInt ADDC = CTVal->getAPIntValue();
21802 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21803 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
21804 return SDValue();
21805
21806 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
21807 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
21808 "Unexpected constant value");
21809
21810 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
21811 SDValue CCVal = getCondCode(DAG, AArch64CC);
21812 SDValue Cmp = LHS.getOperand(3);
21813
21814 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
21815}
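// Editor's illustrative sketch (not upstream code): with c = 4, the expression
// b + (cond ? 4 : 1) is CSEL(4, 1, cc) + b and becomes CSINC(b + 4, b, cc),
// i.e. roughly
//   add   w8, w1, #4
//   csinc w0, w8, w1, <cc>
// so the constant 1 never needs to be materialized.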
21816
21817// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
21818static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
21819 EVT VT = N->getValueType(0);
21820 if (N->getOpcode() != ISD::ADD)
21821 return SDValue();
21822
21823 SDValue Dot = N->getOperand(0);
21824 SDValue A = N->getOperand(1);
21825 // Handle commutativity
21826 auto isZeroDot = [](SDValue Dot) {
21827 return (Dot.getOpcode() == AArch64ISD::UDOT ||
21828 Dot.getOpcode() == AArch64ISD::SDOT) &&
21829 isZerosVector(Dot.getOperand(0).getNode());
21830 };
21831 if (!isZeroDot(Dot))
21832 std::swap(Dot, A);
21833 if (!isZeroDot(Dot))
21834 return SDValue();
21835
21836 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
21837 Dot.getOperand(2));
21838}
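// Editor's illustrative sketch (not upstream code): when the accumulator input
// of the dot product is a zero vector, an existing vector accumulator A can be
// fed straight into it:
//   add(A, UDOT(zeroes, x, y)) --> UDOT(A, x, y)
// so a single "udot v0.4s, v1.16b, v2.16b" replaces a udot plus a vector add.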
21839
21840static bool isNegatedInteger(SDValue Op) {
21841 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
21842}
21843
21844// Try to fold
21845//
21846// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
21847//
21848// The folding helps csel to be matched with csneg without generating
21849// redundant neg instruction, which includes negation of the csel expansion
21850// of abs node lowered by lowerABS.
21851static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
21852 if (!isNegatedInteger(SDValue(N, 0)))
21853 return SDValue();
21854
21855 SDValue CSel = N->getOperand(1);
21856 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
21857 return SDValue();
21858
21859 SDValue N0 = CSel.getOperand(0);
21860 SDValue N1 = CSel.getOperand(1);
21861
21862 // If neither of them are negations, it's not worth the folding as it
21863 // introduces two additional negations while reducing one negation.
21864 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
21865 return SDValue();
21866
21867 SDLoc DL(N);
21868 EVT VT = CSel.getValueType();
21869
21870 SDValue N0N = DAG.getNegative(N0, DL, VT);
21871 SDValue N1N = DAG.getNegative(N1, DL, VT);
21872
21873 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
21874 CSel.getOperand(3));
21875}
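// Editor's illustrative sketch (not upstream code): negating an abs-style
// select such as (sub 0, (csel x, (sub 0, x), cc)) pushes the negation into
// both arms, giving (csel (sub 0, x), x, cc), which isel can match as roughly
//   cmp   w0, #0
//   csneg w0, w0, w0, mi
// with no separate neg instruction.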
21876
21877// The basic add/sub long vector instructions have variants with "2" on the end
21878// which act on the high-half of their inputs. They are normally matched by
21879// patterns like:
21880//
21881// (add (zeroext (extract_high LHS)),
21882// (zeroext (extract_high RHS)))
21883// -> uaddl2 vD, vN, vM
21884//
21885// However, if one of the extracts is something like a duplicate, this
21886// instruction can still be used profitably. This function puts the DAG into a
21887// more appropriate form for those patterns to trigger.
21888static SDValue performAddSubLongCombine(SDNode *N,
21889 TargetLowering::DAGCombinerInfo &DCI) {
21890 SelectionDAG &DAG = DCI.DAG;
21891 if (DCI.isBeforeLegalizeOps())
21892 return SDValue();
21893
21894 MVT VT = N->getSimpleValueType(0);
21895 if (!VT.is128BitVector()) {
21896 if (N->getOpcode() == ISD::ADD)
21897 return performSetccAddFolding(N, DAG);
21898 return SDValue();
21899 }
21900
21901 // Make sure both branches are extended in the same way.
21902 SDValue LHS = N->getOperand(0);
21903 SDValue RHS = N->getOperand(1);
21904 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
21905 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
21906 LHS.getOpcode() != RHS.getOpcode())
21907 return SDValue();
21908
21909 unsigned ExtType = LHS.getOpcode();
21910
21911 // It's not worth doing unless at least one of the inputs is already an
21912 // extract, but we don't know which it'll be so we have to try both.
21913 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
21914 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
21915 if (!RHS.getNode())
21916 return SDValue();
21917
21918 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
21919 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
21920 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
21921 if (!LHS.getNode())
21922 return SDValue();
21923
21924 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
21925 }
21926
21927 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
21928}
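// Editor's illustrative sketch (not upstream code): if LHS is
// (zext (extract_high V)) and RHS is (zext (dup scalar)), rewriting the DUP as
// the high half of a 128-bit DUP lets the existing patterns select e.g.
//   uaddl2 v0.4s, v1.8h, v2.8h
// instead of widening each side separately before a plain add.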
21929
21930static bool isCMP(SDValue Op) {
21931 return Op.getOpcode() == AArch64ISD::SUBS &&
21932 !Op.getNode()->hasAnyUseOfValue(0);
21933}
21934
21935// (CSEL 1 0 CC Cond) => CC
21936// (CSEL 0 1 CC Cond) => !CC
21937static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
21938 if (Op.getOpcode() != AArch64ISD::CSEL)
21939 return std::nullopt;
21940 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
21941 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
21942 return std::nullopt;
21943 SDValue OpLHS = Op.getOperand(0);
21944 SDValue OpRHS = Op.getOperand(1);
21945 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
21946 return CC;
21947 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
21948 return getInvertedCondCode(CC);
21949
21950 return std::nullopt;
21951}
21952
21953// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
21954// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
21955static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
21956 SDValue CmpOp = Op->getOperand(2);
21957 if (!isCMP(CmpOp))
21958 return SDValue();
21959
21960 if (IsAdd) {
21961 if (!isOneConstant(CmpOp.getOperand(1)))
21962 return SDValue();
21963 } else {
21964 if (!isNullConstant(CmpOp.getOperand(0)))
21965 return SDValue();
21966 }
21967
21968 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
21969 auto CC = getCSETCondCode(CsetOp);
21970 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
21971 return SDValue();
21972
21973 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
21974 Op->getOperand(0), Op->getOperand(1),
21975 CsetOp.getOperand(3));
21976}
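// Editor's illustrative sketch (not upstream code): multi-word addition such
// as
//   lo = a_lo + b_lo; carry = lo < a_lo; hi = a_hi + b_hi + carry;
// produces an ADCS whose carry input is (CMP (CSET HS ...) 1); reusing the
// original flags directly yields roughly
//   adds x0, x0, x2
//   adc  x1, x1, x3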
21977
21978// (ADC x 0 cond) => (CINC x HS cond)
21979static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
21980 SDValue LHS = N->getOperand(0);
21981 SDValue RHS = N->getOperand(1);
21982 SDValue Cond = N->getOperand(2);
21983
21984 if (!isNullConstant(RHS))
21985 return SDValue();
21986
21987 EVT VT = N->getValueType(0);
21988 SDLoc DL(N);
21989
21990 // (CINC x cc cond) <=> (CSINC x x !cc cond)
21991 SDValue CC = getCondCode(DAG, AArch64CC::LO);
21992 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
21993}
21994
21995static SDValue performBuildVectorCombine(SDNode *N,
21996 TargetLowering::DAGCombinerInfo &DCI,
21997 SelectionDAG &DAG) {
21998 SDLoc DL(N);
21999 EVT VT = N->getValueType(0);
22000
22002 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
22003 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
22004 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
22005 if (Elt0->getOpcode() == ISD::FP_ROUND &&
22006 Elt1->getOpcode() == ISD::FP_ROUND &&
22007 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
22008 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
22009 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
22010 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22011 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22012 // Constant index.
22013 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
22014 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
22015 Elt0->getOperand(0)->getOperand(0) ==
22016 Elt1->getOperand(0)->getOperand(0) &&
22017 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
22018 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
22019 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
22020 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
22021 SDValue HighLanes;
22022 if (Elt2->isUndef() && Elt3->isUndef()) {
22023 HighLanes = DAG.getPOISON(MVT::v2f32);
22024 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
22025 Elt3->getOpcode() == ISD::FP_ROUND &&
22026 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
22027 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
22028 Elt2->getConstantOperandVal(1) ==
22029 Elt3->getConstantOperandVal(1) &&
22030 Elt2->getOperand(0)->getOpcode() ==
22031 ISD::EXTRACT_VECTOR_ELT &&
22032 Elt3->getOperand(0)->getOpcode() ==
22033 ISD::EXTRACT_VECTOR_ELT &&
22034 // Constant index.
22035 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
22036 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
22037 Elt2->getOperand(0)->getOperand(0) ==
22038 Elt3->getOperand(0)->getOperand(0) &&
22039 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
22040 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
22041 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
22042 HighLanes =
22043 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
22044 }
22045 if (HighLanes) {
22046 SDValue DoubleToSingleSticky =
22047 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
22048 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
22049 DoubleToSingleSticky, HighLanes);
22050 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
22051 Elt0->getOperand(1));
22052 }
22053 }
22054 }
22055 }
22056
22057 if (VT == MVT::v2f64) {
22058 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
22059 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
22060 Elt1->getOpcode() == ISD::FP_EXTEND &&
22061 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22062 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22063 Elt0->getOperand(0)->getOperand(0) ==
22064 Elt1->getOperand(0)->getOperand(0) &&
22065 // Constant index.
22066 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
22067 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
22068 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
22069 Elt1->getOperand(0)->getConstantOperandVal(1) &&
22070 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
22071 // ResultType's known minimum vector length.
22072 Elt0->getOperand(0)->getConstantOperandVal(1) %
22073 VT.getVectorMinNumElements() ==
22074 0) {
22075 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
22076 if (SrcVec.getValueType() == MVT::v4f16 ||
22077 SrcVec.getValueType() == MVT::v4bf16) {
22078 SDValue HalfToSingle =
22079 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
22080 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
22081 SDValue Extract = DAG.getNode(
22083 HalfToSingle, SubvectorIdx);
22084 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
22085 }
22086 }
22087 }
22088
22089 // A build vector of two extracted elements is equivalent to an
22090 // extract subvector where the inner vector is any-extended to the
22091 // extract_vector_elt VT.
22092 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
22093 // (extract_elt_iXX_to_i32 vec Idx+1))
22094 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
22095
22096 // For now, only consider the v2i32 case, which arises as a result of
22097 // legalization.
22098 if (VT != MVT::v2i32)
22099 return SDValue();
22100
22101 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
22102 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
22103 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22104 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22105 // Constant index.
22106 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
22107 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
22108 // Both EXTRACT_VECTOR_ELT from same vector...
22109 Elt0->getOperand(0) == Elt1->getOperand(0) &&
22110 // ... and contiguous. First element's index +1 == second element's index.
22111 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
22112 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
22113 // ResultType's known minimum vector length.
22114 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
22115 SDValue VecToExtend = Elt0->getOperand(0);
22116 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
22117 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
22118 return SDValue();
22119
22120 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
22121
22122 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
22123 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
22124 SubvectorIdx);
22125 }
22126
22127 return SDValue();
22128}
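// Editor's illustrative sketch (not upstream code): after legalisation, a
// v2i32 build_vector of (extract_elt v4i16 V, 0) and (extract_elt v4i16 V, 1)
// becomes (extract_subvector (any_extend v4i16 V to v4i32), 0), avoiding two
// scalar extracts followed by lane inserts.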
22129
22130// A special combine for the sqdmulh family of instructions.
22131// smin( sra ( mul( sext v0, sext v1 ) ), SHIFT_AMOUNT ),
22132// SATURATING_VAL ) can be reduced to sqdmulh(...)
22133static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
22134
22135 if (N->getOpcode() != ISD::SMIN)
22136 return SDValue();
22137
22138 EVT DestVT = N->getValueType(0);
22139
22140 if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 ||
22141 DestVT.isScalableVector())
22142 return SDValue();
22143
22144 ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
22145
22146 if (!Clamp)
22147 return SDValue();
22148
22149 MVT ScalarType;
22150 unsigned ShiftAmt = 0;
22151 switch (Clamp->getSExtValue()) {
22152 case (1ULL << 15) - 1:
22153 ScalarType = MVT::i16;
22154 ShiftAmt = 16;
22155 break;
22156 case (1ULL << 31) - 1:
22157 ScalarType = MVT::i32;
22158 ShiftAmt = 32;
22159 break;
22160 default:
22161 return SDValue();
22162 }
22163
22164 SDValue Sra = N->getOperand(0);
22165 if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
22166 return SDValue();
22167
22168 ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
22169 if (!RightShiftVec)
22170 return SDValue();
22171 unsigned SExtValue = RightShiftVec->getSExtValue();
22172
22173 if (SExtValue != (ShiftAmt - 1))
22174 return SDValue();
22175
22176 SDValue Mul = Sra.getOperand(0);
22177 if (Mul.getOpcode() != ISD::MUL)
22178 return SDValue();
22179
22180 SDValue SExt0 = Mul.getOperand(0);
22181 SDValue SExt1 = Mul.getOperand(1);
22182
22183 if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
22184 SExt1.getOpcode() != ISD::SIGN_EXTEND)
22185 return SDValue();
22186
22187 EVT SExt0Type = SExt0.getOperand(0).getValueType();
22188 EVT SExt1Type = SExt1.getOperand(0).getValueType();
22189
22190 if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType ||
22191 SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() ||
22192 SExt0Type.getVectorNumElements() == 1)
22193 return SDValue();
22194
22195 SDLoc DL(N);
22196 SDValue V0 = SExt0.getOperand(0);
22197 SDValue V1 = SExt1.getOperand(0);
22198
22199 // Ensure input vectors are extended to legal types
22200 if (SExt0Type.getFixedSizeInBits() < 64) {
22201 unsigned VecNumElements = SExt0Type.getVectorNumElements();
22202 EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements),
22203 VecNumElements);
22204 V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0);
22205 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1);
22206 }
22207
22208 SDValue SQDMULH =
22209 DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
22210
22211 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
22212}
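// Editor's illustrative sketch (not upstream code): a Q15-style saturating
// multiply written as
//   smin(sra(mul(sext v0, sext v1), 15), 32767)
// on v4i16 inputs matches the pattern above and is selected as a single
//   sqdmulh v0.4h, v0.4h, v1.4h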
22213
22215 if (SDValue V = trySQDMULHCombine(N, DAG)) {
22216 return V;
22217 }
22218
22219 return SDValue();
22220}
22221
22224 SDLoc DL(N);
22225 EVT VT = N->getValueType(0);
22226 SDValue N0 = N->getOperand(0);
22227 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
22228 N0.getOpcode() == AArch64ISD::DUP) {
22229 SDValue Op = N0.getOperand(0);
22230 if (VT.getScalarType() == MVT::i32 &&
22231 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
22232 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
22233 return DAG.getNode(N0.getOpcode(), DL, VT, Op);
22234 }
22235
22236 // Performing the following combine produces a preferable form for ISEL.
22237 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
22238 if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22239 N0.hasOneUse()) {
22240 SDValue Op = N0.getOperand(0);
22241 SDValue ExtractIndexNode = N0.getOperand(1);
22242 if (!isa<ConstantSDNode>(ExtractIndexNode))
22243 return SDValue();
22244
22245 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
22246 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
22247 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
22248 "Unexpected legalisation result!");
22249
22250 EVT SrcVectorType = Op.getValueType();
22251 // We also assume that SrcVectorType cannot be a V64 (see
22252 // LowerEXTRACT_VECTOR_ELT).
22253 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
22254 "Unexpected legalisation result!");
22255
22256 unsigned ExtractIndex =
22257 cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
22258 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
22259
22260 Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
22261 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
22262 DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
22263 }
22264
22265 return SDValue();
22266}
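// Editor's illustrative sketch (not upstream code): i32 (trunc (extract v2i64
// V, 1)) becomes i32 (extract (nvcast v4i32 V), 2), so isel can read lane 2 of
// the 32-bit view directly (roughly "mov w0, v0.s[2]") instead of extracting a
// 64-bit lane and truncating it.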
22267
22268// Check whether a node is an extend or shift operand.
22269static bool isExtendOrShiftOperand(SDValue N) {
22270 unsigned Opcode = N.getOpcode();
22271 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
22272 EVT SrcVT;
22273 if (Opcode == ISD::SIGN_EXTEND_INREG)
22274 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
22275 else
22276 SrcVT = N.getOperand(0).getValueType();
22277
22278 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
22279 } else if (Opcode == ISD::AND) {
22280 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
22281 if (!CSD)
22282 return false;
22283 uint64_t AndMask = CSD->getZExtValue();
22284 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
22285 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
22286 return isa<ConstantSDNode>(N.getOperand(1));
22287 }
22288
22289 return false;
22290}
22291
22292// (N - Y) + Z --> (Z - Y) + N
22293// when N is an extend or shift operand
22294static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
22295 SelectionDAG &DAG) {
22296 auto IsOneUseExtend = [](SDValue N) {
22297 return N.hasOneUse() && isExtendOrShiftOperand(N);
22298 };
22299
22300 // DAGCombiner will revert the combination when Z is a constant, causing an
22301 // infinite loop, so don't enable the combination when Z is constant.
22302 // If Z is a one-use extend or shift, we also can't do the optimization, as
22303 // it would likewise fall into an infinite loop.
22304 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
22305 return SDValue();
22306
22307 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
22308 return SDValue();
22309
22310 SDValue Shift = SUB.getOperand(0);
22311 if (!IsOneUseExtend(Shift))
22312 return SDValue();
22313
22314 SDLoc DL(N);
22315 EVT VT = N->getValueType(0);
22316
22317 SDValue Y = SUB.getOperand(1);
22318 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
22319 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
22320}
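// Editor's illustrative sketch (not upstream code): for ((x << 3) - y) + z the
// rewrite produces (z - y) + (x << 3), so the shift folds into the final add:
//   sub w8, w2, w1
//   add w0, w8, w0, lsl #3
// (register assignment is illustrative).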
22321
22322static SDValue performAddCombineForShiftedOperands(SDNode *N,
22323 SelectionDAG &DAG) {
22324 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
22325 // commutative.
22326 if (N->getOpcode() != ISD::ADD)
22327 return SDValue();
22328
22329 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
22330 // shifted register is only available for i32 and i64.
22331 EVT VT = N->getValueType(0);
22332 if (VT != MVT::i32 && VT != MVT::i64)
22333 return SDValue();
22334
22335 SDLoc DL(N);
22336 SDValue LHS = N->getOperand(0);
22337 SDValue RHS = N->getOperand(1);
22338
22339 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
22340 return Val;
22341 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
22342 return Val;
22343
22344 uint64_t LHSImm = 0, RHSImm = 0;
22345 // If both operands are shifted by an immediate and the shift amount is not
22346 // greater than 4 for one of them, swap LHS and RHS to put the operand with
22347 // the smaller shift amount on the RHS.
22348 //
22349 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc.), ADD with
22350 // an LSL shift of at most 4 has lower latency and higher throughput than ADD
22351 // with a larger LSL shift. On other processors this is a no-op for both
22352 // performance and correctness.
22353 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
22354 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
22355 RHSImm > 4 && LHS.hasOneUse())
22356 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
22357
22358 return SDValue();
22359}
22360
22361// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
22362// This reassociates it back to allow the creation of more mls instructions.
22363static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
22364 if (N->getOpcode() != ISD::SUB)
22365 return SDValue();
22366
22367 SDValue Add = N->getOperand(1);
22368 SDValue X = N->getOperand(0);
22369 if (Add.getOpcode() != ISD::ADD)
22370 return SDValue();
22371
22372 if (!Add.hasOneUse())
22373 return SDValue();
22374 if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(X)))
22375 return SDValue();
22376
22377 SDValue M1 = Add.getOperand(0);
22378 SDValue M2 = Add.getOperand(1);
22379 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
22380 M1.getOpcode() != AArch64ISD::UMULL)
22381 return SDValue();
22382 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
22383 M2.getOpcode() != AArch64ISD::UMULL)
22384 return SDValue();
22385
22386 EVT VT = N->getValueType(0);
22387 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
22388 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
22389}
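// Editor's illustrative sketch (not upstream code): x - (m1*a + m2*b) is
// reassociated to (x - m1*a) - m2*b so both multiplies can fold into
// multiply-subtract instructions, roughly
//   mls v0.4s, v1.4s, v2.4s
//   mls v0.4s, v3.4s, v4.4s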
22390
22391// Combine into mla/mls.
22392// This works on the patterns of:
22393// add v1, (mul v2, v3)
22394// sub v1, (mul v2, v3)
22395// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
22396// It will transform the add/sub to a scalable version, so that we can
22397// make use of SVE's MLA/MLS that will be generated for that pattern
22398static SDValue
22399performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
22400 SelectionDAG &DAG = DCI.DAG;
22401 // Make sure that the types are legal
22402 if (!DCI.isAfterLegalizeDAG())
22403 return SDValue();
22404 // Before using SVE's features, check first if it's available.
22405 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
22406 return SDValue();
22407
22408 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
22409 return SDValue();
22410
22411 if (!N->getValueType(0).isFixedLengthVector())
22412 return SDValue();
22413
22414 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
22415 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
22416 return SDValue();
22417
22418 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
22419 return SDValue();
22420
22421 SDValue MulValue = Op1->getOperand(0);
22422 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
22423 return SDValue();
22424
22425 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
22426 return SDValue();
22427
22428 EVT ScalableVT = MulValue.getValueType();
22429 if (!ScalableVT.isScalableVector())
22430 return SDValue();
22431
22432 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
22433 SDValue NewValue =
22434 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
22435 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
22436 };
22437
22438 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
22439 return res;
22440 else if (N->getOpcode() == ISD::ADD)
22441 return performOpt(N->getOperand(1), N->getOperand(0));
22442
22443 return SDValue();
22444}
22445
22446// Given a i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
22447// help, for example, to produce ssra from sshr+add.
22448static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
22449 EVT VT = N->getValueType(0);
22450 if (VT != MVT::i64 ||
22451 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
22452 return SDValue();
22453 SDValue Op0 = N->getOperand(0);
22454 SDValue Op1 = N->getOperand(1);
22455
22456 // At least one of the operands should be an extract, and the other should be
22457 // something that is easy to convert to v1i64 type (in this case a load).
22458 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
22459 Op0.getOpcode() != ISD::LOAD)
22460 return SDValue();
22461 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
22462 Op1.getOpcode() != ISD::LOAD)
22463 return SDValue();
22464
22465 SDLoc DL(N);
22466 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22467 Op0.getOperand(0).getValueType() == MVT::v1i64) {
22468 Op0 = Op0.getOperand(0);
22469 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
22470 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22471 Op1.getOperand(0).getValueType() == MVT::v1i64) {
22472 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
22473 Op1 = Op1.getOperand(0);
22474 } else
22475 return SDValue();
22476
22477 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
22478 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
22479 DAG.getConstant(0, DL, MVT::i64));
22480}
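// Editor's illustrative sketch (not upstream code): for
//   i64 add(extract_elt(v1i64 sshr(V, #3), 0), i64 load(p))
// performing the add as a v1i64 operation lets isel form a shift-and-
// accumulate such as "ssra d0, d1, #3" rather than moving both values into
// general-purpose registers first.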
22481
22484 if (!BV->hasOneUse())
22485 return false;
22486 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
22487 if (!Ld || !Ld->isSimple())
22488 return false;
22489 Loads.push_back(Ld);
22490 return true;
22491 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
22493 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
22494 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
22495 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
22496 return false;
22497 Loads.push_back(Ld);
22498 }
22499 return true;
22500 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
22501 // Try to find a tree of shuffles and concats from how IR shuffles of loads
22502 // are lowered. Note that this only comes up because we do not always visit
22503 // operands before uses. After that is fixed this can be removed and in the
22504 // meantime this is fairly specific to the lowering we expect from IR.
22505 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
22506 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
22507 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
22508 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
22509 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
22510 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
22511 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
22512 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
22513 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
22514 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
22515 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
22516 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
22517 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
22518 B.getOperand(1).getNumOperands() != 4)
22519 return false;
22520 auto SV1 = cast<ShuffleVectorSDNode>(B);
22521 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
22522 int NumElts = B.getValueType().getVectorNumElements();
22523 int NumSubElts = NumElts / 4;
22524 for (int I = 0; I < NumSubElts; I++) {
22525 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
22526 if (SV1->getMaskElt(I) != I ||
22527 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
22528 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
22529 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
22530 return false;
22531 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
22532 if (SV2->getMaskElt(I) != I ||
22533 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
22534 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
22535 return false;
22536 }
22537 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
22538 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
22539 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
22540 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
22541 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
22542 !Ld2->isSimple() || !Ld3->isSimple())
22543 return false;
22544 Loads.push_back(Ld0);
22545 Loads.push_back(Ld1);
22546 Loads.push_back(Ld2);
22547 Loads.push_back(Ld3);
22548 return true;
22549 }
22550 return false;
22551}
22552
22553static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
22554 SelectionDAG &DAG,
22555 unsigned &NumSubLoads) {
22556 if (!Op0.hasOneUse() || !Op1.hasOneUse())
22557 return false;
22558
22559 SmallVector<LoadSDNode *> Loads0, Loads1;
22560 if (isLoadOrMultipleLoads(Op0, Loads0) &&
22561 isLoadOrMultipleLoads(Op1, Loads1)) {
22562 if (NumSubLoads && Loads0.size() != NumSubLoads)
22563 return false;
22564 NumSubLoads = Loads0.size();
22565 return Loads0.size() == Loads1.size() &&
22566 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
22567 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
22568 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
22569 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
22570 Size / 8, 1);
22571 });
22572 }
22573
22574 if (Op0.getOpcode() != Op1.getOpcode())
22575 return false;
22576
22577 switch (Op0.getOpcode()) {
22578 case ISD::ADD:
22579 case ISD::SUB:
22580 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
22581 DAG, NumSubLoads) &&
22582 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
22583 DAG, NumSubLoads);
22584 case ISD::SIGN_EXTEND:
22585 case ISD::ANY_EXTEND:
22586 case ISD::ZERO_EXTEND:
22587 EVT XVT = Op0.getOperand(0).getValueType();
22588 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
22589 XVT.getScalarSizeInBits() != 32)
22590 return false;
22591 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
22592 DAG, NumSubLoads);
22593 }
22594 return false;
22595}
22596
22597// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4))
22598// into a single load of twice the size, from which we extract the bottom and
22599// top parts so that the shl can use a shll2 instruction. The two loads in that
22600// example can also be larger trees of instructions, which are identical except
22601// for the leaves which are all loads offset from the LHS, including
22602// buildvectors of multiple loads. For example the RHS tree could be
22603// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
22604// Whilst it can be common for the larger loads to replace LDP instructions
22605// (which doesn't gain anything on its own), the larger loads can help create
22606// more efficient code, and in buildvectors prevent the need for ld1 lane
22607// inserts which can be slower than normal loads.
22608static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
22609 EVT VT = N->getValueType(0);
22610 if (!VT.isFixedLengthVector() ||
22611 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
22612 VT.getScalarSizeInBits() != 64))
22613 return SDValue();
22614
22615 SDValue Other = N->getOperand(0);
22616 SDValue Shift = N->getOperand(1);
22617 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
22618 std::swap(Shift, Other);
22619 APInt ShiftAmt;
22620 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
22621 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
22622 return SDValue();
22623
22624 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
22625 !ISD::isExtOpcode(Other.getOpcode()) ||
22626 Shift.getOperand(0).getOperand(0).getValueType() !=
22627 Other.getOperand(0).getValueType() ||
22628 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
22629 return SDValue();
22630
22631 SDValue Op0 = Other.getOperand(0);
22632 SDValue Op1 = Shift.getOperand(0).getOperand(0);
22633
22634 unsigned NumSubLoads = 0;
22635 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
22636 return SDValue();
22637
22638 // Attempt to rule out some unprofitable cases using heuristics (some working
22639 // around suboptimal code generation), notably if the extends would not be able
22640 // to use ushll2 instructions because the types are not large enough. Otherwise
22641 // zips will need to be created, which can increase the instruction count.
22642 unsigned NumElts = Op0.getValueType().getVectorNumElements();
22643 unsigned NumSubElts = NumElts / NumSubLoads;
22644 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
22645 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
22646 Op0.getValueType().getSizeInBits() < 128 &&
22647 !DAG.getTargetLoweringInfo().isTypeLegal(Op0.getValueType())))
22648 return SDValue();
22649
22650 // Recreate the tree with the new combined loads.
22651 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
22652 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
22653 EVT DVT =
22654 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
22655
22656 SmallVector<LoadSDNode *> Loads0, Loads1;
22657 if (isLoadOrMultipleLoads(Op0, Loads0) &&
22658 isLoadOrMultipleLoads(Op1, Loads1)) {
22659 EVT LoadVT = EVT::getVectorVT(
22660 *DAG.getContext(), Op0.getValueType().getScalarType(),
22661 Op0.getValueType().getVectorNumElements() / Loads0.size());
22662 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
22663
22664 SmallVector<SDValue> NewLoads;
22665 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
22666 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
22667 L0->getBasePtr(), L0->getPointerInfo(),
22668 L0->getBaseAlign());
22669 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
22670 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
22671 NewLoads.push_back(Load);
22672 }
22673 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
22674 }
22675
22676 SmallVector<SDValue> Ops;
22677 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
22678 Ops.push_back(GenCombinedTree(O0, O1, DAG));
22679 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
22680 };
22681 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
22682
22683 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
22684 int Hi = NumSubElts, Lo = 0;
22685 for (unsigned i = 0; i < NumSubLoads; i++) {
22686 for (unsigned j = 0; j < NumSubElts; j++) {
22687 LowMask[i * NumSubElts + j] = Lo++;
22688 HighMask[i * NumSubElts + j] = Hi++;
22689 }
22690 Lo += NumSubElts;
22691 Hi += NumSubElts;
22692 }
22693 SDLoc DL(N);
22694 SDValue Ext0, Ext1;
22695 // Extract the low and high lanes, then extend the results. If the two
22696 // operands use the same extend, instead extend the combined result first and
22697 // then extract the lanes, as that produces slightly smaller code.
22698 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
22699 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
22700 NewOp, DAG.getConstant(0, DL, MVT::i64));
22701 SDValue SubH =
22702 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
22703 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22704 SDValue Extr0 =
22705 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
22706 SDValue Extr1 =
22707 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
22708 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
22709 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
22710 } else {
22711 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
22712 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
22713 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22714 DAG.getConstant(0, DL, MVT::i64));
22715 SDValue SubH =
22716 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
22717 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
22718 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
22719 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
22720 }
22721 SDValue NShift =
22722 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
22723 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
22724}
22725
22726// Attempt to combine the following patterns:
22727// SUB x, (CSET LO, (CMP a, b)) -> SBC x, 0, (CMP a, b)
22728// SUB (SUB x, y), (CSET LO, (CMP a, b)) -> SBC x, y, (CMP a, b)
22729// The CSET may be preceded by a ZEXT.
22730static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG) {
22731 if (N->getOpcode() != ISD::SUB)
22732 return SDValue();
22733
22734 EVT VT = N->getValueType(0);
22735 if (VT != MVT::i32 && VT != MVT::i64)
22736 return SDValue();
22737
22738 SDValue N1 = N->getOperand(1);
22739 if (N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse())
22740 N1 = N1.getOperand(0);
22741 if (!N1.hasOneUse() || getCSETCondCode(N1) != AArch64CC::LO)
22742 return SDValue();
22743
22744 SDValue Flags = N1.getOperand(3);
22745 if (Flags.getOpcode() != AArch64ISD::SUBS)
22746 return SDValue();
22747
22748 SDLoc DL(N);
22749 SDValue N0 = N->getOperand(0);
22750 if (N0->getOpcode() == ISD::SUB)
22751 return DAG.getNode(AArch64ISD::SBC, DL, VT, N0.getOperand(0),
22752 N0.getOperand(1), Flags);
22753 return DAG.getNode(AArch64ISD::SBC, DL, VT, N0, DAG.getConstant(0, DL, VT),
22754 Flags);
22755}
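// Editor's illustrative sketch (not upstream code): multi-word subtraction
// with a borrow written as
//   hi = a_hi - b_hi - (a_lo < b_lo)
// materializes the borrow with CSET LO; consuming the flags directly gives
// roughly
//   subs x0, x0, x2
//   sbc  x1, x1, x3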
22756
22757// add(trunc(ashr(A, C)), trunc(lshr(A, BW-1))), with C >= BW
22758// ->
22759// X = trunc(ashr(A, C)); add(x, lshr(X, BW-1)
22760// The original converts into ashr+lshr+xtn+xtn+add. The second becomes
22761// ashr+xtn+usra. The first form has less total latency due to more parallelism,
22762// but more micro-ops and seems to be slower in practice.
22763static SDValue performAddTruncShiftCombine(SDNode *N, SelectionDAG &DAG) {
22764 using namespace llvm::SDPatternMatch;
22765 EVT VT = N->getValueType(0);
22766 if (VT != MVT::v2i32 && VT != MVT::v4i16 && VT != MVT::v8i8)
22767 return SDValue();
22768
22769 SDValue AShr, LShr;
22770 if (!sd_match(N, m_Add(m_Trunc(m_Value(AShr)), m_Trunc(m_Value(LShr)))))
22771 return SDValue();
22772 if (AShr.getOpcode() != AArch64ISD::VASHR)
22773 std::swap(AShr, LShr);
22774 if (AShr.getOpcode() != AArch64ISD::VASHR ||
22775 LShr.getOpcode() != AArch64ISD::VLSHR ||
22776 AShr.getOperand(0) != LShr.getOperand(0) ||
22777 AShr.getConstantOperandVal(1) < VT.getScalarSizeInBits() ||
22778 LShr.getConstantOperandVal(1) != VT.getScalarSizeInBits() * 2 - 1)
22779 return SDValue();
22780
22781 SDLoc DL(N);
22782 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, AShr);
22783 SDValue Shift = DAG.getNode(
22784 AArch64ISD::VLSHR, DL, VT, Trunc,
22785 DAG.getTargetConstant(VT.getScalarSizeInBits() - 1, DL, MVT::i32));
22786 return DAG.getNode(ISD::ADD, DL, VT, Trunc, Shift);
22787}
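// Editor's illustrative sketch (not upstream code): with a v4i32 source A and
// v4i16 result,
//   add(trunc(ashr(A, 16)), trunc(lshr(A, 31)))
// becomes X = trunc(ashr(A, 16)); add(X, lshr(X, 15)), which selects as
// sshr + xtn + usra rather than two shifts, two narrows and an add.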
22788
22789static SDValue performAddSubCombine(SDNode *N,
22790 TargetLowering::DAGCombinerInfo &DCI) {
22791 // Try to change sum of two reductions.
22792 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
22793 return Val;
22794 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
22795 return Val;
22796 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
22797 return Val;
22798 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
22799 return Val;
22800 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
22801 return Val;
22802 if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
22803 return Val;
22804 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
22805 return Val;
22806 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
22807 return Val;
22808 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
22809 return Val;
22810 if (SDValue Val = performSubWithBorrowCombine(N, DCI.DAG))
22811 return Val;
22812 if (SDValue Val = performAddTruncShiftCombine(N, DCI.DAG))
22813 return Val;
22814
22815 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
22816 return Val;
22817
22818 return performAddSubLongCombine(N, DCI);
22819}
22820
22821// Massage DAGs which we can use the high-half "long" operations on into
22822// something isel will recognize better. E.g.
22823//
22824// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
22825// (aarch64_neon_umull (extract_high (v2i64 vec)))
22826// (extract_high (v2i64 (dup128 scalar)))))
22827//
22828static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
22829 TargetLowering::DAGCombinerInfo &DCI,
22830 SelectionDAG &DAG) {
22831 if (DCI.isBeforeLegalizeOps())
22832 return SDValue();
22833
22834 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
22835 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
22836 assert(LHS.getValueType().is64BitVector() &&
22837 RHS.getValueType().is64BitVector() &&
22838 "unexpected shape for long operation");
22839
22840 // Either node could be a DUP, but it's not worth doing both of them (you'd
22841 // just as well use the non-high version) so look for a corresponding extract
22842 // operation on the other "wing".
22845 if (!RHS.getNode())
22846 return SDValue();
22847 } else if (isEssentiallyExtractHighSubvector(RHS)) {
22848 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
22849 if (!LHS.getNode())
22850 return SDValue();
22851 } else
22852 return SDValue();
22853
22854 if (IID == Intrinsic::not_intrinsic)
22855 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
22856
22857 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
22858 N->getOperand(0), LHS, RHS);
22859}
22860
22861static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
22862 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
22863 unsigned ElemBits = ElemTy.getSizeInBits();
22864
22865 int64_t ShiftAmount;
22866 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
22867 APInt SplatValue, SplatUndef;
22868 unsigned SplatBitSize;
22869 bool HasAnyUndefs;
22870 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
22871 HasAnyUndefs, ElemBits) ||
22872 SplatBitSize != ElemBits)
22873 return SDValue();
22874
22875 ShiftAmount = SplatValue.getSExtValue();
22876 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
22877 ShiftAmount = CVN->getSExtValue();
22878 } else
22879 return SDValue();
22880
22881 // If the shift amount is zero, remove the shift intrinsic.
22882 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
22883 return N->getOperand(1);
22884
22885 unsigned Opcode;
22886 bool IsRightShift;
22887 switch (IID) {
22888 default:
22889 llvm_unreachable("Unknown shift intrinsic");
22890 case Intrinsic::aarch64_neon_sqshl:
22891 Opcode = AArch64ISD::SQSHL_I;
22892 IsRightShift = false;
22893 break;
22894 case Intrinsic::aarch64_neon_uqshl:
22895 Opcode = AArch64ISD::UQSHL_I;
22896 IsRightShift = false;
22897 break;
22898 case Intrinsic::aarch64_neon_srshl:
22899 Opcode = AArch64ISD::SRSHR_I;
22900 IsRightShift = true;
22901 break;
22902 case Intrinsic::aarch64_neon_urshl:
22903 Opcode = AArch64ISD::URSHR_I;
22904 IsRightShift = true;
22905 break;
22906 case Intrinsic::aarch64_neon_sqshlu:
22907 Opcode = AArch64ISD::SQSHLU_I;
22908 IsRightShift = false;
22909 break;
22910 case Intrinsic::aarch64_neon_sshl:
22911 case Intrinsic::aarch64_neon_ushl:
22912 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
22913 // left shift for positive shift amounts. For negative shifts we can use a
22914 // VASHR/VLSHR as appropriate.
22915 if (ShiftAmount < 0) {
22916 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
22917 : AArch64ISD::VLSHR;
22918 ShiftAmount = -ShiftAmount;
22919 } else
22920 Opcode = AArch64ISD::VSHL;
22921 IsRightShift = false;
22922 break;
22923 }
22924
22925 EVT VT = N->getValueType(0);
22926 SDValue Op = N->getOperand(1);
22927 SDLoc DL(N);
22928 if (VT == MVT::i64) {
22929 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op);
22930 VT = MVT::v1i64;
22931 }
22932
22933 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
22934 Op = DAG.getNode(Opcode, DL, VT, Op,
22935 DAG.getSignedConstant(-ShiftAmount, DL, MVT::i32, true));
22936 if (N->getValueType(0) == MVT::i64)
22937 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22938 DAG.getConstant(0, DL, MVT::i64));
22939 return Op;
22940 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
22941 Op = DAG.getNode(Opcode, DL, VT, Op,
22942 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
22943 if (N->getValueType(0) == MVT::i64)
22944 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
22945 DAG.getConstant(0, DL, MVT::i64));
22946 return Op;
22947 }
22948
22949 return SDValue();
22950}
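// Editor's illustrative sketch (not upstream code): an aarch64.neon.sshl call
// whose shift operand is a constant splat of +3 is rewritten to the immediate
// VSHL node (roughly "shl v0.4s, v0.4s, #3"), while a splat of -3 becomes
// VASHR (roughly "sshr v0.4s, v0.4s, #3").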
22951
22952// The CRC32[BH] instructions ignore the high bits of their data operand. Since
22953// the intrinsics must be legal and take an i32, this means there's almost
22954// certainly going to be a zext in the DAG which we can eliminate.
22955static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
22956 SDValue AndN = N->getOperand(2);
22957 if (AndN.getOpcode() != ISD::AND)
22958 return SDValue();
22959
22960 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
22961 if (!CMask || CMask->getZExtValue() != Mask)
22962 return SDValue();
22963
22964 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
22965 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
22966}
22967
22969 SelectionDAG &DAG) {
22970 SDLoc DL(N);
22971 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
22972 DAG.getNode(Opc, DL, N->getOperand(1).getSimpleValueType(),
22973 N->getOperand(1)),
22974 DAG.getConstant(0, DL, MVT::i64));
22975}
22976
22977static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
22978 SDLoc DL(N);
22979 SDValue Op1 = N->getOperand(1);
22980 SDValue Op2 = N->getOperand(2);
22981 EVT ScalarTy = Op2.getValueType();
22982 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22983 ScalarTy = MVT::i32;
22984
22985 // Lower index_vector(base, step) to mul(step step_vector(1)) + splat(base).
22986 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
22987 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
22988 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
22989 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
22990 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
22991}
22992
22993static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
22994 SDLoc DL(N);
22995 SDValue Scalar = N->getOperand(3);
22996 EVT ScalarTy = Scalar.getValueType();
22997
22998 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22999 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
23000
23001 SDValue Passthru = N->getOperand(1);
23002 SDValue Pred = N->getOperand(2);
23003 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, DL, N->getValueType(0),
23004 Pred, Scalar, Passthru);
23005}
23006
23007static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
23008 SDLoc DL(N);
23009 LLVMContext &Ctx = *DAG.getContext();
23010 EVT VT = N->getValueType(0);
23011
23012 assert(VT.isScalableVector() && "Expected a scalable vector.");
23013
23014 // Current lowering only supports the SVE-ACLE types.
23016 return SDValue();
23017
23018 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
23019 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
23020 EVT ByteVT =
23021 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
23022
23023 // Convert everything to the domain of EXT (i.e bytes).
23024 SDValue Op0 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(1));
23025 SDValue Op1 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(2));
23026 SDValue Op2 = DAG.getNode(ISD::MUL, DL, MVT::i32, N->getOperand(3),
23027 DAG.getConstant(ElemSize, DL, MVT::i32));
23028
23029 SDValue EXT = DAG.getNode(AArch64ISD::EXT, DL, ByteVT, Op0, Op1, Op2);
23030 return DAG.getNode(ISD::BITCAST, DL, VT, EXT);
23031}
23032
23033static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
23034 TargetLowering::DAGCombinerInfo &DCI,
23035 SelectionDAG &DAG) {
23036 if (DCI.isBeforeLegalize())
23037 return SDValue();
23038
23039 SDValue Comparator = N->getOperand(3);
23040 if (Comparator.getOpcode() == AArch64ISD::DUP ||
23041 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
23042 unsigned IID = getIntrinsicID(N);
23043 EVT VT = N->getValueType(0);
23044 EVT CmpVT = N->getOperand(2).getValueType();
23045 SDValue Pred = N->getOperand(1);
23046 SDValue Imm;
23047 SDLoc DL(N);
23048
23049 switch (IID) {
23050 default:
23051 llvm_unreachable("Called with wrong intrinsic!");
23052 break;
23053
23054 // Signed comparisons
23055 case Intrinsic::aarch64_sve_cmpeq_wide:
23056 case Intrinsic::aarch64_sve_cmpne_wide:
23057 case Intrinsic::aarch64_sve_cmpge_wide:
23058 case Intrinsic::aarch64_sve_cmpgt_wide:
23059 case Intrinsic::aarch64_sve_cmplt_wide:
23060 case Intrinsic::aarch64_sve_cmple_wide: {
23061 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
23062 int64_t ImmVal = CN->getSExtValue();
23063 if (ImmVal >= -16 && ImmVal <= 15)
23064 Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
23065 else
23066 return SDValue();
23067 }
23068 break;
23069 }
23070 // Unsigned comparisons
23071 case Intrinsic::aarch64_sve_cmphs_wide:
23072 case Intrinsic::aarch64_sve_cmphi_wide:
23073 case Intrinsic::aarch64_sve_cmplo_wide:
23074 case Intrinsic::aarch64_sve_cmpls_wide: {
23075 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
23076 uint64_t ImmVal = CN->getZExtValue();
23077 if (ImmVal <= 127)
23078 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
23079 else
23080 return SDValue();
23081 }
23082 break;
23083 }
23084 }
23085
23086 if (!Imm)
23087 return SDValue();
23088
23089 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
23090 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
23091 N->getOperand(2), Splat, DAG.getCondCode(CC));
23092 }
23093
23094 return SDValue();
23095}
23096
23097static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
23098 AArch64CC::CondCode Cond) {
23099 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23100
23101 SDLoc DL(Op);
23102 assert(Op.getValueType().isScalableVector() &&
23103 TLI.isTypeLegal(Op.getValueType()) &&
23104 "Expected legal scalable vector type!");
23105 assert(Op.getValueType() == Pg.getValueType() &&
23106 "Expected same type for PTEST operands");
23107
23108 // Ensure target specific opcodes are using legal type.
23109 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
23110 SDValue TVal = DAG.getConstant(1, DL, OutVT);
23111 SDValue FVal = DAG.getConstant(0, DL, OutVT);
23112
23113 // Ensure operands have type nxv16i1.
23114 if (Op.getValueType() != MVT::nxv16i1) {
23117 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
23118 else
23119 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
23120 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
23121 }
23122
23123 unsigned PTest = AArch64ISD::PTEST;
23124 if (Cond == AArch64CC::ANY_ACTIVE)
23125 PTest = AArch64ISD::PTEST_ANY;
23126 else if (Cond == AArch64CC::FIRST_ACTIVE)
23127 PTest = AArch64ISD::PTEST_FIRST;
23128
23129 // Set condition code (CC) flags.
23130 SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op);
23131
23132 // Convert CC to integer based on requested condition.
23133 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
23134 SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond));
23135 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
23136 return DAG.getZExtOrTrunc(Res, DL, VT);
23137}
23138
23139static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
23140 SelectionDAG &DAG) {
23141 SDLoc DL(N);
23142
23143 SDValue Pred = N->getOperand(1);
23144 SDValue VecToReduce = N->getOperand(2);
23145
23146 // NOTE: The integer reduction's result type is not always linked to the
23147 // operand's element type so we construct it from the intrinsic's result type.
23148 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
23149 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
23150
23151 // SVE reductions set the whole vector register with the first element
23152 // containing the reduction result, which we'll now extract.
23153 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
23154 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
23155 Zero);
23156}
23157
23158static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
23159 SelectionDAG &DAG) {
23160 SDLoc DL(N);
23161
23162 SDValue Pred = N->getOperand(1);
23163 SDValue VecToReduce = N->getOperand(2);
23164
23165 EVT ReduceVT = VecToReduce.getValueType();
23166 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
23167
23168 // SVE reductions set the whole vector register with the first element
23169 // containing the reduction result, which we'll now extract.
23170 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
23171 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
23172 Zero);
23173}
23174
23175static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
23176 SelectionDAG &DAG) {
23177 SDLoc DL(N);
23178
23179 SDValue Pred = N->getOperand(1);
23180 SDValue InitVal = N->getOperand(2);
23181 SDValue VecToReduce = N->getOperand(3);
23182 EVT ReduceVT = VecToReduce.getValueType();
23183
23184 // Ordered reductions use the first lane of the result vector as the
23185 // reduction's initial value.
23186 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
23187 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
23188 DAG.getUNDEF(ReduceVT), InitVal, Zero);
23189
23190 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
23191
23192 // SVE reductions set the whole vector register with the first element
23193 // containing the reduction result, which we'll now extract.
23194 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
23195 Zero);
23196}
23197
23198static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode,
23199 SelectionDAG &DAG) {
23200 if (N->getValueType(0) != MVT::i16)
23201 return SDValue();
23202
23203 SDLoc DL(N);
23204 SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
23205 SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
23206 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
23207}
23208
23209// If a merged operation has no inactive lanes we can relax it to a predicated
23210// or unpredicated operation, which potentially allows better isel (perhaps
23211// using immediate forms) or relaxing register reuse requirements.
23212static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
23213 SelectionDAG &DAG, bool UnpredOp = false,
23214 bool SwapOperands = false) {
23215 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
23216 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
23217 SDValue Pg = N->getOperand(1);
23218 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
23219 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
23220
23221 // ISD way to specify an all active predicate.
23222 if (isAllActivePredicate(DAG, Pg)) {
23223 if (UnpredOp)
23224 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
23225
23226 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
23227 }
23228
23229 // FUTURE: SplatVector(true)
23230 return SDValue();
23231}
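// Editor's illustrative sketch (not upstream code): a merging SVE integer add
// whose governing predicate is known to be all active can be relaxed to a
// plain unpredicated ISD::ADD, which in turn enables immediate forms such as
// "add z0.s, z0.s, #1" instead of a predicated register-register add.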
23232
23233static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
23234 SDLoc DL(N);
23235 EVT VT = N->getValueType(0);
23236 SDValue Op1 = N->getOperand(1);
23237 SDValue Op2 = N->getOperand(2);
23238 SDValue Op3 = N->getOperand(3);
23239
23240 switch (IID) {
23241 default:
23242 llvm_unreachable("Called with wrong intrinsic!");
23243 case Intrinsic::aarch64_sve_bsl:
23244 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2);
23245 case Intrinsic::aarch64_sve_bsl1n:
23246 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, DAG.getNOT(DL, Op1, VT),
23247 Op2);
23248 case Intrinsic::aarch64_sve_bsl2n:
23249 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1,
23250 DAG.getNOT(DL, Op2, VT));
23251 case Intrinsic::aarch64_sve_nbsl:
23252 return DAG.getNOT(DL, DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2),
23253 VT);
23254 }
23255}
23256
23259 const AArch64Subtarget *Subtarget) {
23260 SelectionDAG &DAG = DCI.DAG;
23261 unsigned IID = getIntrinsicID(N);
23262 switch (IID) {
23263 default:
23264 break;
23265 case Intrinsic::aarch64_neon_vcvtfxs2fp:
23266 case Intrinsic::aarch64_neon_vcvtfxu2fp:
23267 return tryCombineFixedPointConvert(N, DCI, DAG);
23268 case Intrinsic::aarch64_neon_saddv:
23269 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
23270 case Intrinsic::aarch64_neon_uaddv:
23271 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
23272 case Intrinsic::aarch64_neon_sminv:
23273 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
23274 case Intrinsic::aarch64_neon_uminv:
23275 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
23276 case Intrinsic::aarch64_neon_smaxv:
23277 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
23278 case Intrinsic::aarch64_neon_umaxv:
23279 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
23280 case Intrinsic::aarch64_neon_fmax:
23281 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
23282 N->getOperand(1), N->getOperand(2));
23283 case Intrinsic::aarch64_neon_fmin:
23284 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
23285 N->getOperand(1), N->getOperand(2));
23286 case Intrinsic::aarch64_neon_fmaxnm:
23287 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
23288 N->getOperand(1), N->getOperand(2));
23289 case Intrinsic::aarch64_neon_fminnm:
23290 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
23291 N->getOperand(1), N->getOperand(2));
23292 case Intrinsic::aarch64_neon_smull:
23293 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
23294 N->getOperand(1), N->getOperand(2));
23295 case Intrinsic::aarch64_neon_umull:
23296 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
23297 N->getOperand(1), N->getOperand(2));
23298 case Intrinsic::aarch64_neon_pmull:
23299 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
23300 N->getOperand(1), N->getOperand(2));
23301 case Intrinsic::aarch64_neon_sqdmull:
23302 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
23303 case Intrinsic::aarch64_neon_sqshl:
23304 case Intrinsic::aarch64_neon_uqshl:
23305 case Intrinsic::aarch64_neon_sqshlu:
23306 case Intrinsic::aarch64_neon_srshl:
23307 case Intrinsic::aarch64_neon_urshl:
23308 case Intrinsic::aarch64_neon_sshl:
23309 case Intrinsic::aarch64_neon_ushl:
23310 return tryCombineShiftImm(IID, N, DAG);
23311 case Intrinsic::aarch64_neon_sabd:
23312 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
23313 N->getOperand(1), N->getOperand(2));
23314 case Intrinsic::aarch64_neon_uabd:
23315 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
23316 N->getOperand(1), N->getOperand(2));
23317 case Intrinsic::aarch64_neon_fcvtzs:
23318 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
23319 case Intrinsic::aarch64_neon_fcvtzu:
23320 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
23321 case Intrinsic::aarch64_neon_fcvtas:
23322 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
23323 case Intrinsic::aarch64_neon_fcvtau:
23324 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
23325 case Intrinsic::aarch64_neon_fcvtms:
23326 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
23327 case Intrinsic::aarch64_neon_fcvtmu:
23328 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
23329 case Intrinsic::aarch64_neon_fcvtns:
23330 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
23331 case Intrinsic::aarch64_neon_fcvtnu:
23332 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
23333 case Intrinsic::aarch64_neon_fcvtps:
23334 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
23335 case Intrinsic::aarch64_neon_fcvtpu:
23336 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
23337 case Intrinsic::aarch64_crc32b:
23338 case Intrinsic::aarch64_crc32cb:
23339 return tryCombineCRC32(0xff, N, DAG);
23340 case Intrinsic::aarch64_crc32h:
23341 case Intrinsic::aarch64_crc32ch:
23342 return tryCombineCRC32(0xffff, N, DAG);
23343 case Intrinsic::aarch64_sve_saddv:
23344 // There is no i64 version of SADDV because the sign is irrelevant.
23345 if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64)
23346 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
23347 else
23348 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
23349 case Intrinsic::aarch64_sve_uaddv:
23350 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
23351 case Intrinsic::aarch64_sve_smaxv:
23352 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
23353 case Intrinsic::aarch64_sve_umaxv:
23354 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
23355 case Intrinsic::aarch64_sve_sminv:
23356 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
23357 case Intrinsic::aarch64_sve_uminv:
23358 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
23359 case Intrinsic::aarch64_sve_orv:
23360 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
23361 case Intrinsic::aarch64_sve_eorv:
23362 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
23363 case Intrinsic::aarch64_sve_andv:
23364 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
23365 case Intrinsic::aarch64_sve_index:
23366 return LowerSVEIntrinsicIndex(N, DAG);
23367 case Intrinsic::aarch64_sve_dup:
23368 return LowerSVEIntrinsicDUP(N, DAG);
23369 case Intrinsic::aarch64_sve_dup_x:
23370 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
23371 N->getOperand(1));
23372 case Intrinsic::aarch64_sve_ext:
23373 return LowerSVEIntrinsicEXT(N, DAG);
23374 case Intrinsic::aarch64_sve_mul_u:
23375 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
23376 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23377 case Intrinsic::aarch64_sve_smulh_u:
23378 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
23379 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23380 case Intrinsic::aarch64_sve_umulh_u:
23381 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
23382 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23383 case Intrinsic::aarch64_sve_smin_u:
23384 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
23385 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23386 case Intrinsic::aarch64_sve_umin_u:
23387 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
23388 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23389 case Intrinsic::aarch64_sve_smax_u:
23390 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
23391 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23392 case Intrinsic::aarch64_sve_umax_u:
23393 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
23394 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23395 case Intrinsic::aarch64_sve_lsl_u:
23396 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
23397 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23398 case Intrinsic::aarch64_sve_lsr_u:
23399 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
23400 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23401 case Intrinsic::aarch64_sve_asr_u:
23402 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
23403 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23404 case Intrinsic::aarch64_sve_fadd_u:
23405 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
23406 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23407 case Intrinsic::aarch64_sve_fdiv_u:
23408 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
23409 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23410 case Intrinsic::aarch64_sve_fmax_u:
23411 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
23412 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23413 case Intrinsic::aarch64_sve_fmaxnm_u:
23414 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
23415 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23416 case Intrinsic::aarch64_sve_fmla_u:
23417 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
23418 N->getOperand(1), N->getOperand(3), N->getOperand(4),
23419 N->getOperand(2));
23420 case Intrinsic::aarch64_sve_fmin_u:
23421 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
23422 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23423 case Intrinsic::aarch64_sve_fminnm_u:
23424 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
23425 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23426 case Intrinsic::aarch64_sve_fmul_u:
23427 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
23428 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23429 case Intrinsic::aarch64_sve_fsub_u:
23430 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
23431 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23432 case Intrinsic::aarch64_sve_add_u:
23433 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
23434 N->getOperand(3));
23435 case Intrinsic::aarch64_sve_sub_u:
23436 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
23437 N->getOperand(3));
23438 case Intrinsic::aarch64_sve_subr:
23439 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
23440 case Intrinsic::aarch64_sve_and_u:
23441 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
23442 N->getOperand(3));
23443 case Intrinsic::aarch64_sve_bic_u:
23444 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
23445 N->getOperand(2), N->getOperand(3));
23446 case Intrinsic::aarch64_sve_saddwb:
23447 return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
23448 N->getOperand(1), N->getOperand(2));
23449 case Intrinsic::aarch64_sve_saddwt:
23450 return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
23451 N->getOperand(1), N->getOperand(2));
23452 case Intrinsic::aarch64_sve_uaddwb:
23453 return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
23454 N->getOperand(1), N->getOperand(2));
23455 case Intrinsic::aarch64_sve_uaddwt:
23456 return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
23457 N->getOperand(1), N->getOperand(2));
23458 case Intrinsic::aarch64_sve_eor_u:
23459 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
23460 N->getOperand(3));
23461 case Intrinsic::aarch64_sve_orr_u:
23462 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
23463 N->getOperand(3));
23464 case Intrinsic::aarch64_sve_sabd_u:
23465 if (SDValue V = convertMergedOpToPredOp(N, ISD::ABDS, DAG, true))
23466 return V;
23467 return DAG.getNode(AArch64ISD::ABDS_PRED, SDLoc(N), N->getValueType(0),
23468 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23469 case Intrinsic::aarch64_sve_uabd_u:
23470 if (SDValue V = convertMergedOpToPredOp(N, ISD::ABDU, DAG, true))
23471 return V;
23472 return DAG.getNode(AArch64ISD::ABDU_PRED, SDLoc(N), N->getValueType(0),
23473 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23474 case Intrinsic::aarch64_sve_sdiv_u:
23475 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
23476 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23477 case Intrinsic::aarch64_sve_udiv_u:
23478 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
23479 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23480 case Intrinsic::aarch64_sve_sqadd:
23481 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
23482 case Intrinsic::aarch64_sve_sqsub_u:
23483 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
23484 N->getOperand(2), N->getOperand(3));
23485 case Intrinsic::aarch64_sve_uqadd:
23486 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
23487 case Intrinsic::aarch64_sve_uqsub_u:
23488 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
23489 N->getOperand(2), N->getOperand(3));
23490 case Intrinsic::aarch64_sve_sqadd_x:
23491 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
23492 N->getOperand(1), N->getOperand(2));
23493 case Intrinsic::aarch64_sve_sqsub_x:
23494 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
23495 N->getOperand(1), N->getOperand(2));
23496 case Intrinsic::aarch64_sve_uqadd_x:
23497 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
23498 N->getOperand(1), N->getOperand(2));
23499 case Intrinsic::aarch64_sve_uqsub_x:
23500 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
23501 N->getOperand(1), N->getOperand(2));
23502 case Intrinsic::aarch64_sve_asrd:
23503 return DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, SDLoc(N), N->getValueType(0),
23504 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23505 case Intrinsic::aarch64_sve_cmphs:
23506 if (!N->getOperand(2).getValueType().isFloatingPoint())
23507 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23508 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23509 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
23510 break;
23511 case Intrinsic::aarch64_sve_cmphi:
23512 if (!N->getOperand(2).getValueType().isFloatingPoint())
23513 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23514 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23515 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
23516 break;
23517 case Intrinsic::aarch64_sve_fcmpge:
23518 case Intrinsic::aarch64_sve_cmpge:
23519 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23520 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23521 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
23522 break;
23523 case Intrinsic::aarch64_sve_fcmpgt:
23524 case Intrinsic::aarch64_sve_cmpgt:
23525 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23526 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23527 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
23528 break;
23529 case Intrinsic::aarch64_sve_fcmpeq:
23530 case Intrinsic::aarch64_sve_cmpeq:
23531 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23532 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23533 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
23534 break;
23535 case Intrinsic::aarch64_sve_fcmpne:
23536 case Intrinsic::aarch64_sve_cmpne:
23537 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23538 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23539 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
23540 break;
23541 case Intrinsic::aarch64_sve_fcmpuo:
23542 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
23543 N->getValueType(0), N->getOperand(1), N->getOperand(2),
23544 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
23545 break;
23546 case Intrinsic::aarch64_sve_fadda:
23547 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
23548 case Intrinsic::aarch64_sve_faddv:
23549 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
23550 case Intrinsic::aarch64_sve_fmaxnmv:
23551 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
23552 case Intrinsic::aarch64_sve_fmaxv:
23553 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
23554 case Intrinsic::aarch64_sve_fminnmv:
23555 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
23556 case Intrinsic::aarch64_sve_fminv:
23557 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
23558 case Intrinsic::aarch64_sve_sel:
23559 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
23560 N->getOperand(1), N->getOperand(2), N->getOperand(3));
23561 case Intrinsic::aarch64_sve_cmpeq_wide:
23562 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
23563 case Intrinsic::aarch64_sve_cmpne_wide:
23564 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
23565 case Intrinsic::aarch64_sve_cmpge_wide:
23566 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
23567 case Intrinsic::aarch64_sve_cmpgt_wide:
23568 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
23569 case Intrinsic::aarch64_sve_cmplt_wide:
23570 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
23571 case Intrinsic::aarch64_sve_cmple_wide:
23572 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
23573 case Intrinsic::aarch64_sve_cmphs_wide:
23574 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
23575 case Intrinsic::aarch64_sve_cmphi_wide:
23576 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
23577 case Intrinsic::aarch64_sve_cmplo_wide:
23578 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
23579 case Intrinsic::aarch64_sve_cmpls_wide:
23580 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
23581 case Intrinsic::aarch64_sve_ptest_any:
23582    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23583                    AArch64CC::ANY_ACTIVE);
23584 case Intrinsic::aarch64_sve_ptest_first:
23585    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23586                    AArch64CC::FIRST_ACTIVE);
23587 case Intrinsic::aarch64_sve_ptest_last:
23588    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
23589                    AArch64CC::LAST_ACTIVE);
23590 case Intrinsic::aarch64_sve_whilelo:
23591 return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, SDLoc(N), N->getValueType(0),
23592 N->getOperand(1), N->getOperand(2));
23593 case Intrinsic::aarch64_sve_bsl:
23594 case Intrinsic::aarch64_sve_bsl1n:
23595 case Intrinsic::aarch64_sve_bsl2n:
23596 case Intrinsic::aarch64_sve_nbsl:
23597 return combineSVEBitSel(IID, N, DAG);
23598 }
23599 return SDValue();
23600}
23601
23602static bool isCheapToExtend(const SDValue &N) {
23603 unsigned OC = N->getOpcode();
23604  return OC == ISD::LOAD || OC == ISD::MLOAD ||
23605         ISD::isConstantSplatVectorAllZeros(N.getNode());
23606}
23607
23608static SDValue
23609performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23610                              SelectionDAG &DAG) {
23611 // If we have (sext (setcc A B)) and A and B are cheap to extend,
23612 // we can move the sext into the arguments and have the same result. For
23613 // example, if A and B are both loads, we can make those extending loads and
23614 // avoid an extra instruction. This pattern appears often in VLS code
23615 // generation where the inputs to the setcc have a different size to the
23616 // instruction that wants to use the result of the setcc.
23617 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
23618 N->getOperand(0)->getOpcode() == ISD::SETCC);
23619 const SDValue SetCC = N->getOperand(0);
23620
23621 const SDValue CCOp0 = SetCC.getOperand(0);
23622 const SDValue CCOp1 = SetCC.getOperand(1);
23623 if (!CCOp0->getValueType(0).isInteger() ||
23624 !CCOp1->getValueType(0).isInteger())
23625 return SDValue();
23626
23627 ISD::CondCode Code =
23628 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
23629
23630 ISD::NodeType ExtType =
23631 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23632
23633 if (isCheapToExtend(SetCC.getOperand(0)) &&
23634 isCheapToExtend(SetCC.getOperand(1))) {
23635 const SDValue Ext1 =
23636 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
23637 const SDValue Ext2 =
23638 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
23639
23640 return DAG.getSetCC(
23641 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
23642 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
23643 }
23644
23645 return SDValue();
23646}
23647
23648// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
23649// This comes from interleaved vectorization. It is performed late to capture
23650// uitofp converts too.
23652 SelectionDAG &DAG) {
23653 EVT VT = N->getValueType(0);
23654 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
23655 N->getOpcode() != ISD::ZERO_EXTEND ||
23656 N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
23657 return SDValue();
23658
23659 unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
23660 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23661 return SDValue();
23662
23663 EVT InVT = N->getOperand(0).getOperand(0).getValueType();
23664 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
23665 if (!Shuffle ||
23666 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
23667 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
23668 return SDValue();
23669
23670  unsigned Idx;
23671  bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
23672      Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
23673 // An undef interleave shuffle can come up after other canonicalizations,
23674 // where the shuffle has been converted to
23675 // zext(extract(shuffle b, undef, [u,u,0,4]))
23676 bool IsUndefDeInterleave = false;
23677 if (!IsDeInterleave)
23678 IsUndefDeInterleave =
23679 Shuffle->getOperand(1).isUndef() &&
23680 all_of(
23681 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements() / 2),
23682 [](int M) { return M < 0; }) &&
23683            ShuffleVectorInst::isDeInterleaveMaskOfFactor(
23684                Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
23685 VT.getVectorNumElements() / 2),
23686 4, Idx);
23687 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
23688 return SDValue();
23689 SDLoc DL(N);
23690 SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23691 Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
23692 SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23693 Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
23694 SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
23695 VT, BC1, BC2);
23696 if ((Idx & 1) == 1)
23697 UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
23698 DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
23699 return DAG.getNode(
23700 ISD::AND, DL, VT, UZP,
23701 DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
23702}
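// Worked example (types chosen for exposition): for
//   zext <4 x i16> (extract_subvector (shuffle <8 x i16> a, b, <0,4,8,12,...>), 0) to <4 x i32>
// Idx selects between uzp1 and uzp2, an extra 16-bit VLSHR is emitted when Idx
// is odd, and the final AND with 0xffff keeps only the original narrow element.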
23703
23704// This comes up similarly to the above when lowering deinterleaving shuffles
23705// from zexts. We have legalized the operations in the general case to
23706// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
23707// the extract is to the low half and the uzp is uzp1. There would be an extra
23708// shift if the uzp was uzp2 to grab the upper half. Due to the combine above
23709// there could also be an existing and / shift that can be combined in, either
23710// before or after the extract.
23711static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
23712  EVT VT = N->getValueType(0);
23713 if (N->getOpcode() != ISD::ZERO_EXTEND ||
23714 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
23715 return SDValue();
23716
23717 SDValue Op = N->getOperand(0);
23718 unsigned ExtOffset = (unsigned)-1;
23719 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23720 ExtOffset = Op.getConstantOperandVal(1);
23721 Op = Op.getOperand(0);
23722 }
23723
23724  unsigned Shift = 0;
23725  APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
23726                                    Op.getValueType().getScalarSizeInBits());
23727
23728 if (Op.getOpcode() == AArch64ISD::VLSHR) {
23729 Shift = Op.getConstantOperandVal(1);
23730 Op = Op.getOperand(0);
23731 Mask = Mask.lshr(Shift);
23732 }
23733 if (Op.getOpcode() == ISD::AND &&
23734 ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
23735 Op = Op.getOperand(0);
23736 Mask = Mask.zext(VT.getScalarSizeInBits());
23737 } else if (Op.getOpcode() == AArch64ISD::BICi) {
23738 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
23739 Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
23740 Mask = Mask.zext(VT.getScalarSizeInBits());
23741 Op = Op.getOperand(0);
23742 }
23743
23744 if (ExtOffset == (unsigned)-1) {
23745 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23746 ExtOffset = Op.getConstantOperandVal(1);
23747 Op = Op.getOperand(0);
23748 } else
23749 return SDValue();
23750 }
23751 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
23752 return SDValue();
23753
23754 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
23755 return SDValue();
23756 if (Op.getOpcode() == AArch64ISD::UZP2)
23757 Shift += VT.getScalarSizeInBits() / 2;
23758
23759 SDLoc DL(N);
23760 SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
23761 Op.getOperand(ExtOffset == 0 ? 0 : 1));
23762 if (Shift != 0)
23763 BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
23764 DAG.getTargetConstant(Shift, DL, MVT::i32));
23765 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
23766}
23767
23768static SDValue performExtendCombine(SDNode *N,
23769                                    TargetLowering::DAGCombinerInfo &DCI,
23770                                    SelectionDAG &DAG) {
23771 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
23772 // we can convert that DUP into another extract_high (of a bigger DUP), which
23773 // helps the backend to decide that an sabdl2 would be useful, saving a real
23774 // extract_high operation.
23775 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
23776 N->getOperand(0).getValueType().is64BitVector() &&
23777 (N->getOperand(0).getOpcode() == ISD::ABDU ||
23778 N->getOperand(0).getOpcode() == ISD::ABDS)) {
23779 SDNode *ABDNode = N->getOperand(0).getNode();
23780    SDValue NewABD =
23781        tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
23782 if (!NewABD.getNode())
23783 return SDValue();
23784
23785 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
23786 }
23787
23789 return R;
23790 if (SDValue R = performZExtUZPCombine(N, DAG))
23791 return R;
23792
23793 if (N->getValueType(0).isFixedLengthVector() &&
23794 N->getOpcode() == ISD::SIGN_EXTEND &&
23795 N->getOperand(0)->getOpcode() == ISD::SETCC)
23796 return performSignExtendSetCCCombine(N, DCI, DAG);
23797
23798 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
23799 // that the top half of the result register must be unused, due to the
23800 // any_extend. This means that we can replace this pattern with (rev16
23801 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
23802 // ...)), which is what this pattern would otherwise be lowered to.
23803  // Only apply this optimisation if the any_extend in the original pattern
23804  // extends to i32 or i64, because that type becomes the input type to REV16
23805  // in the new pattern and so must be a legitimate REV16 input type.
23806 SDValue Bswap = N->getOperand(0);
23807 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
23808 Bswap.getValueType() == MVT::i16 &&
23809 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
23810 SDLoc DL(N);
23811 SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
23812 Bswap->getOperand(0));
23813 return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
23814 NewAnyExtend);
23815 }
23816
23817 return SDValue();
23818}
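// Illustrative example of the bswap rewrite above: in
//   (i32 any_extend (i16 bswap x))
// only the low 16 bits of the result are used, so it becomes
//   (rev16 (i32 any_extend x))
// saving the shift that the (lsr (rev ...), #16) lowering would otherwise need.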
23819
23820static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
23821                               SDValue SplatVal, unsigned NumVecElts) {
23822 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
23823 Align OrigAlignment = St.getAlign();
23824 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
23825
23826 // Create scalar stores. This is at least as good as the code sequence for a
23827 // split unaligned store which is a dup.s, ext.b, and two stores.
23828 // Most of the time the three stores should be replaced by store pair
23829 // instructions (stp).
23830 SDLoc DL(&St);
23831 SDValue BasePtr = St.getBasePtr();
23832 uint64_t BaseOffset = 0;
23833
23834 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
23835 SDValue NewST1 =
23836 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
23837 OrigAlignment, St.getMemOperand()->getFlags());
23838
23839  // As this is in ISel, we will not merge this add, which may degrade results.
23840 if (BasePtr->getOpcode() == ISD::ADD &&
23841 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
23842 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
23843 BasePtr = BasePtr->getOperand(0);
23844 }
23845
23846 unsigned Offset = EltOffset;
23847 while (--NumVecElts) {
23848 Align Alignment = commonAlignment(OrigAlignment, Offset);
23849 SDValue OffsetPtr =
23850 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
23851 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
23852 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
23853 PtrInfo.getWithOffset(Offset), Alignment,
23854 St.getMemOperand()->getFlags());
23855 Offset += EltOffset;
23856 }
23857 return NewST1;
23858}
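// For exposition: splitting a splat store of x into <2 x i64> at [base] yields
//   str x, [base]
//   str x, [base, #8]
// which the load/store optimizer is normally able to fold into a single stp.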
23859
23860// Returns an SVE type that ContentTy can be trivially sign or zero extended
23861// into.
23862static MVT getSVEContainerType(EVT ContentTy) {
23863 assert(ContentTy.isSimple() && "No SVE containers for extended types");
23864
23865 switch (ContentTy.getSimpleVT().SimpleTy) {
23866 default:
23867 llvm_unreachable("No known SVE container for this MVT type");
23868 case MVT::nxv2i8:
23869 case MVT::nxv2i16:
23870 case MVT::nxv2i32:
23871 case MVT::nxv2i64:
23872 case MVT::nxv2f32:
23873 case MVT::nxv2f64:
23874 return MVT::nxv2i64;
23875 case MVT::nxv4i8:
23876 case MVT::nxv4i16:
23877 case MVT::nxv4i32:
23878 case MVT::nxv4f32:
23879 return MVT::nxv4i32;
23880 case MVT::nxv8i8:
23881 case MVT::nxv8i16:
23882 case MVT::nxv8f16:
23883 case MVT::nxv8bf16:
23884 return MVT::nxv8i16;
23885 case MVT::nxv16i8:
23886 return MVT::nxv16i8;
23887 }
23888}
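// E.g. the callers below perform an extending load of nxv4i16 in the nxv4i32
// container and truncate the result back, or bitcast nxv4f32 data to nxv4i32
// before storing.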
23889
23890static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
23891  SDLoc DL(N);
23892 EVT VT = N->getValueType(0);
23893
23895 return SDValue();
23896
23897 EVT ContainerVT = VT;
23898 if (ContainerVT.isInteger())
23899 ContainerVT = getSVEContainerType(ContainerVT);
23900
23901 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
23902 SDValue Ops[] = { N->getOperand(0), // Chain
23903 N->getOperand(2), // Pg
23904 N->getOperand(3), // Base
23905 DAG.getValueType(VT) };
23906
23907 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
23908 SDValue LoadChain = SDValue(Load.getNode(), 1);
23909
23910 if (ContainerVT.isInteger() && (VT != ContainerVT))
23911 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
23912
23913 return DAG.getMergeValues({ Load, LoadChain }, DL);
23914}
23915
23916static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
23917  SDLoc DL(N);
23918 EVT VT = N->getValueType(0);
23919 EVT PtrTy = N->getOperand(3).getValueType();
23920
23921 EVT LoadVT = VT;
23922 if (VT.isFloatingPoint())
23923 LoadVT = VT.changeTypeToInteger();
23924
23925 auto *MINode = cast<MemIntrinsicSDNode>(N);
23926 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
23927 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
23928 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
23929 MINode->getOperand(2), PassThru,
23930                                MINode->getMemoryVT(), MINode->getMemOperand(),
23931                                ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
23932
23933 if (VT.isFloatingPoint()) {
23934 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
23935 return DAG.getMergeValues(Ops, DL);
23936 }
23937
23938 return L;
23939}
23940
23941template <unsigned Opcode>
23943 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
23944 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
23945 "Unsupported opcode.");
23946 SDLoc DL(N);
23947 EVT VT = N->getValueType(0);
23948
23949 EVT LoadVT = VT;
23950 if (VT.isFloatingPoint())
23951 LoadVT = VT.changeTypeToInteger();
23952
23953 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
23954 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
23955 SDValue LoadChain = SDValue(Load.getNode(), 1);
23956
23957 if (VT.isFloatingPoint())
23958 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
23959
23960 return DAG.getMergeValues({Load, LoadChain}, DL);
23961}
23962
23963static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
23964  SDLoc DL(N);
23965 SDValue Data = N->getOperand(2);
23966 EVT DataVT = Data.getValueType();
23967 EVT HwSrcVt = getSVEContainerType(DataVT);
23968 SDValue InputVT = DAG.getValueType(DataVT);
23969
23970 if (DataVT.isFloatingPoint())
23971 InputVT = DAG.getValueType(HwSrcVt);
23972
23973 SDValue SrcNew;
23974 if (Data.getValueType().isFloatingPoint())
23975 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
23976 else
23977 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
23978
23979 SDValue Ops[] = { N->getOperand(0), // Chain
23980 SrcNew,
23981 N->getOperand(4), // Base
23982 N->getOperand(3), // Pg
23983 InputVT
23984 };
23985
23986 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
23987}
23988
23989static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
23990  SDLoc DL(N);
23991
23992 SDValue Data = N->getOperand(2);
23993 EVT DataVT = Data.getValueType();
23994 EVT PtrTy = N->getOperand(4).getValueType();
23995
23996 if (DataVT.isFloatingPoint())
23997 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
23998
23999 auto *MINode = cast<MemIntrinsicSDNode>(N);
24000 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
24001 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
24002 MINode->getMemoryVT(), MINode->getMemOperand(),
24003 ISD::UNINDEXED, false, false);
24004}
24005
24006/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
24007/// load store optimizer pass will merge them to store pair stores. This should
24008/// be better than a movi to create the vector zero followed by a vector store
24009/// if the zero constant is not re-used, since one instruction and one register
24010/// live range will be removed.
24011///
24012/// For example, the final generated code should be:
24013///
24014/// stp xzr, xzr, [x0]
24015///
24016/// instead of:
24017///
24018/// movi v0.2d, #0
24019/// str q0, [x0]
24020///
24021static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
24022  SDValue StVal = St.getValue();
24023 EVT VT = StVal.getValueType();
24024
24025 // Avoid scalarizing zero splat stores for scalable vectors.
24026 if (VT.isScalableVector())
24027 return SDValue();
24028
24029 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
24030 // 2, 3 or 4 i32 elements.
24031 int NumVecElts = VT.getVectorNumElements();
24032 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
24033 VT.getVectorElementType().getSizeInBits() == 64) ||
24034 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
24035 VT.getVectorElementType().getSizeInBits() == 32)))
24036 return SDValue();
24037
24038 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
24039 return SDValue();
24040
24041 // If the zero constant has more than one use then the vector store could be
24042 // better since the constant mov will be amortized and stp q instructions
24043 // should be able to be formed.
24044 if (!StVal.hasOneUse())
24045 return SDValue();
24046
24047 // If the store is truncating then it's going down to i16 or smaller, which
24048 // means it can be implemented in a single store anyway.
24049 if (St.isTruncatingStore())
24050 return SDValue();
24051
24052 // If the immediate offset of the address operand is too large for the stp
24053 // instruction, then bail out.
24054 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
24055    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
24056    if (Offset < -512 || Offset > 504)
24057      return SDValue();
24058 }
24059
24060 for (int I = 0; I < NumVecElts; ++I) {
24061 SDValue EltVal = StVal.getOperand(I);
24062 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
24063 return SDValue();
24064 }
24065
24066 // Use a CopyFromReg WZR/XZR here to prevent
24067 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
24068 SDLoc DL(&St);
24069 unsigned ZeroReg;
24070 EVT ZeroVT;
24071 if (VT.getVectorElementType().getSizeInBits() == 32) {
24072 ZeroReg = AArch64::WZR;
24073 ZeroVT = MVT::i32;
24074 } else {
24075 ZeroReg = AArch64::XZR;
24076 ZeroVT = MVT::i64;
24077 }
24078 SDValue SplatVal =
24079 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
24080 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
24081}
24082
24083/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
24084/// value. The load store optimizer pass will merge them to store pair stores.
24085/// This has better performance than a splat of the scalar followed by a split
24086/// vector store. Even if the stores are not merged it is four stores vs a dup,
24087/// followed by an ext.b and two stores.
24088static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
24089  SDValue StVal = St.getValue();
24090 EVT VT = StVal.getValueType();
24091
24092 // Don't replace floating point stores, they possibly won't be transformed to
24093 // stp because of the store pair suppress pass.
24094 if (VT.isFloatingPoint())
24095 return SDValue();
24096
24097 // We can express a splat as store pair(s) for 2 or 4 elements.
24098 unsigned NumVecElts = VT.getVectorNumElements();
24099 if (NumVecElts != 4 && NumVecElts != 2)
24100 return SDValue();
24101
24102 // If the store is truncating then it's going down to i16 or smaller, which
24103 // means it can be implemented in a single store anyway.
24104 if (St.isTruncatingStore())
24105 return SDValue();
24106
24107 // Check that this is a splat.
24108 // Make sure that each of the relevant vector element locations are inserted
24109 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
24110 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
24111 SDValue SplatVal;
24112 for (unsigned I = 0; I < NumVecElts; ++I) {
24113 // Check for insert vector elements.
24114 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
24115 return SDValue();
24116
24117 // Check that same value is inserted at each vector element.
24118 if (I == 0)
24119 SplatVal = StVal.getOperand(1);
24120 else if (StVal.getOperand(1) != SplatVal)
24121 return SDValue();
24122
24123 // Check insert element index.
24124    ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
24125    if (!CIndex)
24126 return SDValue();
24127 uint64_t IndexVal = CIndex->getZExtValue();
24128 if (IndexVal >= NumVecElts)
24129 return SDValue();
24130 IndexNotInserted.reset(IndexVal);
24131
24132 StVal = StVal.getOperand(0);
24133 }
24134 // Check that all vector element locations were inserted to.
24135 if (IndexNotInserted.any())
24136 return SDValue();
24137
24138 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
24139}
24140
24141static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24142                           SelectionDAG &DAG,
24143 const AArch64Subtarget *Subtarget) {
24144
24145  StoreSDNode *S = cast<StoreSDNode>(N);
24146  if (S->isVolatile() || S->isIndexed())
24147 return SDValue();
24148
24149 SDValue StVal = S->getValue();
24150 EVT VT = StVal.getValueType();
24151
24152 if (!VT.isFixedLengthVector())
24153 return SDValue();
24154
24155 // If we get a splat of zeros, convert this vector store to a store of
24156 // scalars. They will be merged into store pairs of xzr thereby removing one
24157 // instruction and one register.
24158 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
24159 return ReplacedZeroSplat;
24160
24161 // FIXME: The logic for deciding if an unaligned store should be split should
24162 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
24163 // a call to that function here.
24164
24165 if (!Subtarget->isMisaligned128StoreSlow())
24166 return SDValue();
24167
24168 // Don't split at -Oz.
24169  if (DAG.getMachineFunction().getFunction().hasMinSize())
24170    return SDValue();
24171
24172 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
24173 // those up regresses performance on micro-benchmarks and olden/bh.
24174 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
24175 return SDValue();
24176
24177 // Split unaligned 16B stores. They are terrible for performance.
24178 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
24179 // extensions can use this to mark that it does not want splitting to happen
24180 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
24181 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
24182 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
24183 S->getAlign() <= Align(2))
24184 return SDValue();
24185
24186 // If we get a splat of a scalar convert this vector store to a store of
24187 // scalars. They will be merged into store pairs thereby removing two
24188 // instructions.
24189 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
24190 return ReplacedSplat;
24191
24192 SDLoc DL(S);
24193
24194 // Split VT into two.
24195 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
24196 unsigned NumElts = HalfVT.getVectorNumElements();
24197 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
24198 DAG.getConstant(0, DL, MVT::i64));
24199 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
24200 DAG.getConstant(NumElts, DL, MVT::i64));
24201 SDValue BasePtr = S->getBasePtr();
24202 SDValue NewST1 =
24203 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
24204 S->getAlign(), S->getMemOperand()->getFlags());
24205 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
24206 DAG.getConstant(8, DL, MVT::i64));
24207 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
24208 S->getPointerInfo(), S->getAlign(),
24209 S->getMemOperand()->getFlags());
24210}
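// Rough illustration: an unaligned 16-byte store of a <16 x i8> value on a
// subtarget where misaligned 128-bit stores are slow is emitted as two 8-byte
// stores of the low and high halves at [base] and [base, #8].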
24211
24212static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
24213  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
24214
24215 // splice(pg, op1, undef) -> op1
24216 if (N->getOperand(2).isUndef())
24217 return N->getOperand(1);
24218
24219 return SDValue();
24220}
24221
24222static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
24223                                    const AArch64Subtarget *Subtarget) {
24224 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
24225 N->getOpcode() == AArch64ISD::UUNPKLO) &&
24226 "Unexpected Opcode!");
24227
24228 // uunpklo/hi undef -> undef
24229 if (N->getOperand(0).isUndef())
24230 return DAG.getUNDEF(N->getValueType(0));
24231
24232 // If this is a masked load followed by an UUNPKLO, fold this into a masked
24233 // extending load. We can do this even if this is already a masked
24234 // {z,}extload.
24235 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
24236 N->getOpcode() == AArch64ISD::UUNPKLO) {
24237 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
24238 SDValue Mask = MLD->getMask();
24239 SDLoc DL(N);
24240
24241 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
24242 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
24243 (MLD->getPassThru()->isUndef() ||
24244 isZerosVector(MLD->getPassThru().getNode()))) {
24245 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
24246 unsigned PgPattern = Mask->getConstantOperandVal(0);
24247 EVT VT = N->getValueType(0);
24248
24249 // Ensure we can double the size of the predicate pattern
24250 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
24251 if (NumElts &&
24252 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
24253 Mask =
24254 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
24255 SDValue PassThru = DAG.getConstant(0, DL, VT);
24256 SDValue NewLoad = DAG.getMaskedLoad(
24257 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
24258            PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
24259            MLD->getAddressingMode(), ISD::ZEXTLOAD);
24260
24261 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
24262
24263 return NewLoad;
24264 }
24265 }
24266 }
24267
24268 return SDValue();
24269}
24270
24272 if (N->getOpcode() != AArch64ISD::UZP1)
24273 return false;
24274 SDValue Op0 = N->getOperand(0);
24275 EVT SrcVT = Op0->getValueType(0);
24276 EVT DstVT = N->getValueType(0);
24277 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
24278 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
24279 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
24280}
24281
24282// Try to combine rounding shifts where the operands come from an extend, and
24283// the result is truncated and combined into one vector.
24284// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
24286 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
24287 SDValue Op0 = N->getOperand(0);
24288 SDValue Op1 = N->getOperand(1);
24289 EVT ResVT = N->getValueType(0);
24290
24291 unsigned RshOpc = Op0.getOpcode();
24292 if (RshOpc != AArch64ISD::RSHRNB_I)
24293 return SDValue();
24294
24295 // Same op code and imm value?
24296 SDValue ShiftValue = Op0.getOperand(1);
24297 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
24298 return SDValue();
24299
24300 // Same unextended operand value?
24301 SDValue Lo = Op0.getOperand(0);
24302 SDValue Hi = Op1.getOperand(0);
24303 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
24304 Hi.getOpcode() != AArch64ISD::UUNPKHI)
24305 return SDValue();
24306 SDValue OrigArg = Lo.getOperand(0);
24307 if (OrigArg != Hi.getOperand(0))
24308 return SDValue();
24309
24310 SDLoc DL(N);
24311 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
24312 getPredicateForVector(DAG, DL, ResVT), OrigArg,
24313 ShiftValue);
24314}
24315
24316// Try to simplify:
24317// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
24318// t2 = nxv8i16 srl(t1, ShiftValue)
24319// to
24320// t1 = nxv8i16 rshrnb(X, shiftvalue).
24321// rshrnb will zero the top half bits of each element. Therefore, this combine
24322// should only be performed when a following instruction with the rshrnb
24323// as an operand does not care about the top half of each element. For example,
24324// a uzp1 or a truncating store.
24325static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
24326                                         const AArch64Subtarget *Subtarget) {
24327 EVT VT = Srl->getValueType(0);
24328 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
24329 return SDValue();
24330
24331 EVT ResVT;
24332 if (VT == MVT::nxv8i16)
24333 ResVT = MVT::nxv16i8;
24334 else if (VT == MVT::nxv4i32)
24335 ResVT = MVT::nxv8i16;
24336 else if (VT == MVT::nxv2i64)
24337 ResVT = MVT::nxv4i32;
24338 else
24339 return SDValue();
24340
24341 SDLoc DL(Srl);
24342 unsigned ShiftValue;
24343 SDValue RShOperand;
24344 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
24345 return SDValue();
24346 SDValue Rshrnb = DAG.getNode(
24347 AArch64ISD::RSHRNB_I, DL, ResVT,
24348 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
24349 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
24350}
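// Worked example (constants chosen for exposition): with ShiftValue == 4,
//   t1 = nxv8i16 add X, splat(8)     ; 8 == 1 << (4 - 1)
//   t2 = nxv8i16 srl t1, splat(4)
// becomes rshrnb(X, 4); this is only safe because the consumer (a uzp1 or a
// truncating store) ignores the top half of each element, as noted above.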
24351
24352static SDValue isNVCastToHalfWidthElements(SDValue V) {
24353  if (V.getOpcode() != AArch64ISD::NVCAST)
24354 return SDValue();
24355
24356 SDValue Op = V.getOperand(0);
24357 if (!Op.getValueType().isVector() ||
24358 V.getValueType().getVectorElementCount() !=
24359 Op.getValueType().getVectorElementCount() * 2)
24360 return SDValue();
24361
24362 return Op;
24363}
24364
24365static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
24366                                 const AArch64Subtarget *Subtarget) {
24367 SDLoc DL(N);
24368 SDValue Op0 = N->getOperand(0);
24369 SDValue Op1 = N->getOperand(1);
24370 EVT ResVT = N->getValueType(0);
24371
24372 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
24373 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
24374      Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
24375      Op0.getOperand(0) == Op1.getOperand(0)) {
24376
24377 SDValue SourceVec = Op0.getOperand(0);
24378 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
24379 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
24380 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
24381 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
24382 EVT OpVT = Op0.getOperand(1).getValueType();
24383 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24384 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
24385 DAG.getUNDEF(WidenedResVT));
24386 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
24387 DAG.getConstant(0, DL, OpVT));
24388 }
24389 }
24390
24391 // Following optimizations only work with uzp1.
24392 if (N->getOpcode() == AArch64ISD::UZP2)
24393 return SDValue();
24394
24395 // uzp1(x, undef) -> concat(truncate(x), undef)
24396 if (Op1.isUndef()) {
24397 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
24398 switch (ResVT.getSimpleVT().SimpleTy) {
24399 default:
24400 break;
24401 case MVT::v16i8:
24402 BCVT = MVT::v8i16;
24403 HalfVT = MVT::v8i8;
24404 break;
24405 case MVT::v8i16:
24406 BCVT = MVT::v4i32;
24407 HalfVT = MVT::v4i16;
24408 break;
24409 case MVT::v4i32:
24410 BCVT = MVT::v2i64;
24411 HalfVT = MVT::v2i32;
24412 break;
24413 }
24414 if (BCVT != MVT::Other) {
24415 SDValue BC = DAG.getBitcast(BCVT, Op0);
24416 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
24417 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
24418 DAG.getUNDEF(HalfVT));
24419 }
24420 }
24421
24422 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
24423 return Urshr;
24424
24425 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
24426 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
24427 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
24428 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
24429 }
24430 }
24431
24432 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
24433 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
24434 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
24435 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
24436 }
24437 }
24438
24439 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
24440 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
24441 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
24442 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
24443 SDValue X = PreCast.getOperand(0).getOperand(0);
24444 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
24445 }
24446 }
24447 }
24448
24449 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
24450 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
24451 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
24452 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
24453 SDValue Z = PreCast.getOperand(0).getOperand(1);
24454 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
24455 }
24456 }
24457 }
24458
24459 // These optimizations only work on little endian.
24460 if (!DAG.getDataLayout().isLittleEndian())
24461 return SDValue();
24462
24463 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
24464 // Example:
24465 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
24466 // to
24467 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
24469 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
24470 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
24471 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
24472 Op1.getOperand(0));
24473 }
24474 }
24475
24476 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
24477 return SDValue();
24478
24479 SDValue SourceOp0 = peekThroughBitcasts(Op0);
24480 SDValue SourceOp1 = peekThroughBitcasts(Op1);
24481
24482 // truncating uzp1(x, y) -> xtn(concat (x, y))
24483 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
24484 EVT Op0Ty = SourceOp0.getValueType();
24485 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
24486 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
24487      SDValue Concat =
24488          DAG.getNode(ISD::CONCAT_VECTORS, DL,
24489                      Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
24490                      SourceOp0, SourceOp1);
24491 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
24492 }
24493 }
24494
24495 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
24496 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
24497 SourceOp1.getOpcode() != ISD::TRUNCATE)
24498 return SDValue();
24499 SourceOp0 = SourceOp0.getOperand(0);
24500 SourceOp1 = SourceOp1.getOperand(0);
24501
24502 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
24503 !SourceOp0.getValueType().isSimple())
24504 return SDValue();
24505
24506 EVT ResultTy;
24507
24508 switch (SourceOp0.getSimpleValueType().SimpleTy) {
24509 case MVT::v2i64:
24510 ResultTy = MVT::v4i32;
24511 break;
24512 case MVT::v4i32:
24513 ResultTy = MVT::v8i16;
24514 break;
24515 case MVT::v8i16:
24516 ResultTy = MVT::v16i8;
24517 break;
24518 default:
24519 return SDValue();
24520 }
24521
24522 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
24523 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
24524 SDValue UzpResult =
24525 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
24526
24527 EVT BitcastResultTy;
24528
24529 switch (ResVT.getSimpleVT().SimpleTy) {
24530 case MVT::v2i32:
24531 BitcastResultTy = MVT::v2i64;
24532 break;
24533 case MVT::v4i16:
24534 BitcastResultTy = MVT::v4i32;
24535 break;
24536 case MVT::v8i8:
24537 BitcastResultTy = MVT::v8i16;
24538 break;
24539 default:
24540 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
24541 }
24542
24543 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
24544 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
24545}
24546
24547static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
24548  unsigned Opc = N->getOpcode();
24549
24550 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
24551 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
24552 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
24553 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
24554 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
24555 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
24556 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
24557 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
24558
24559 SDLoc DL(N);
24560 SDValue Chain = N->getOperand(0);
24561 SDValue Pg = N->getOperand(1);
24562 SDValue Base = N->getOperand(2);
24563 SDValue Offset = N->getOperand(3);
24564 SDValue Ty = N->getOperand(4);
24565
24566 EVT ResVT = N->getValueType(0);
24567
24568 const auto OffsetOpc = Offset.getOpcode();
24569 const bool OffsetIsZExt =
24570 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
24571 const bool OffsetIsSExt =
24572 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
24573
24574 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
24575 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
24576 SDValue ExtPg = Offset.getOperand(0);
24577 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
24578 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
24579
24580 // If the predicate for the sign- or zero-extended offset is the
24581 // same as the predicate used for this load and the sign-/zero-extension
24582 // was from a 32-bits...
24583 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
24584 SDValue UnextendedOffset = Offset.getOperand(1);
24585
24586 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
24587 if (Signed)
24588 NewOpc = getSignExtendedGatherOpcode(NewOpc);
24589
24590 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
24591 {Chain, Pg, Base, UnextendedOffset, Ty});
24592 }
24593 }
24594
24595 return SDValue();
24596}
24597
24598/// Optimize a vector shift instruction and its operand if shifted out
24599/// bits are not used.
24600static SDValue performVectorShiftCombine(SDNode *N,
24601                                         const AArch64TargetLowering &TLI,
24602                                         TargetLowering::DAGCombinerInfo &DCI) {
24603  assert(N->getOpcode() == AArch64ISD::VASHR ||
24604 N->getOpcode() == AArch64ISD::VLSHR);
24605
24606 SDValue Op = N->getOperand(0);
24607 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
24608
24609 unsigned ShiftImm = N->getConstantOperandVal(1);
24610 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
24611
24612  // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
24613 if (N->getOpcode() == AArch64ISD::VASHR &&
24614 Op.getOpcode() == AArch64ISD::VSHL &&
24615 N->getOperand(1) == Op.getOperand(1))
24616 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
24617 return Op.getOperand(0);
24618
24619 // If the shift is exact, the shifted out bits matter.
24620 if (N->getFlags().hasExact())
24621 return SDValue();
24622
24623 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
24624 APInt DemandedMask = ~ShiftedOutBits;
24625
24626 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
24627 return SDValue(N, 0);
24628
24629 return SDValue();
24630}
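// For example, (VASHR (VSHL x, #24), #24) on v4i32 only performs a
// sign_extend_inreg, so it is replaced by x outright whenever x is already
// known to have more than 24 sign bits.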
24631
24632static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
24633  // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
24634 // This transform works in partnership with performSetCCPunpkCombine to
24635 // remove unnecessary transfer of predicates into standard registers and back
24636 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
24637 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
24638 MVT::i1) {
24639 SDValue CC = N->getOperand(0)->getOperand(0);
24640 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
24641 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
24642 DAG.getVectorIdxConstant(0, SDLoc(N)));
24643 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
24644 }
24645
24646 return SDValue();
24647}
24648
24649/// Target-specific DAG combine function for post-increment LD1 (lane) and
24650/// post-increment LD1R.
24651static SDValue performPostLD1Combine(SDNode *N,
24652                                     TargetLowering::DAGCombinerInfo &DCI,
24653                                     bool IsLaneOp) {
24654 if (DCI.isBeforeLegalizeOps())
24655 return SDValue();
24656
24657 SelectionDAG &DAG = DCI.DAG;
24658 EVT VT = N->getValueType(0);
24659
24660 if (!VT.is128BitVector() && !VT.is64BitVector())
24661 return SDValue();
24662
24663  // If it is not a LOAD, we cannot do such a combine.
24664 unsigned LoadIdx = IsLaneOp ? 1 : 0;
24665 LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
24666 if (!LD)
24667 return SDValue();
24668
24669 // If the Generic combiner already helped form a pre- or post-indexed load,
24670 // skip forming one here.
24671 if (LD->isIndexed())
24672 return SDValue();
24673
24674 // The vector lane must be a constant in the LD1LANE opcode.
24675 SDValue Lane;
24676 if (IsLaneOp) {
24677 Lane = N->getOperand(2);
24678 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
24679 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
24680 return SDValue();
24681 if (LaneC->getZExtValue() == 0 && isNullOrNullSplat(N->getOperand(0)))
24682 return SDValue();
24683 }
24684
24685 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
24686 EVT MemVT = LoadSDN->getMemoryVT();
24687 // Check if memory operand is the same type as the vector element.
24688 if (MemVT != VT.getVectorElementType())
24689 return SDValue();
24690
24691 // Check if there are other uses. If so, do not combine as it will introduce
24692 // an extra load.
24693 for (SDUse &U : LD->uses()) {
24694 if (U.getResNo() == 1) // Ignore uses of the chain result.
24695 continue;
24696 if (U.getUser() != N)
24697 return SDValue();
24698 }
24699
24700 // If there is one use and it can splat the value, prefer that operation.
24701 // TODO: This could be expanded to more operations if they reliably use the
24702 // index variants.
24703 if (N->hasOneUse()) {
24704 unsigned UseOpc = N->user_begin()->getOpcode();
24705 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
24706 return SDValue();
24707 }
24708
24709 SDValue Addr = LD->getOperand(1);
24710 SDValue Vector = N->getOperand(0);
24711 // Search for a use of the address operand that is an increment.
24712 for (SDUse &Use : Addr->uses()) {
24713 SDNode *User = Use.getUser();
24714 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24715 continue;
24716
24717 // If the increment is a constant, it must match the memory ref size.
24718 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
24719 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
24720 uint32_t IncVal = CInc->getZExtValue();
24721 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
24722 if (IncVal != NumBytes)
24723 continue;
24724 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
24725 }
24726
24727 // To avoid cycle construction make sure that neither the load nor the add
24728 // are predecessors to each other or the Vector.
24729    SmallPtrSet<const SDNode *, 32> Visited;
24730    SmallVector<const SDNode *, 16> Worklist;
24731    Visited.insert(Addr.getNode());
24732 Worklist.push_back(User);
24733 Worklist.push_back(LD);
24734 Worklist.push_back(Vector.getNode());
24735 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
24736 SDNode::hasPredecessorHelper(User, Visited, Worklist))
24737 continue;
24738
24739 SmallVector<SDValue, 8> Ops;
24740 Ops.push_back(LD->getOperand(0)); // Chain
24741 if (IsLaneOp) {
24742 Ops.push_back(Vector); // The vector to be inserted
24743 Ops.push_back(Lane); // The lane to be inserted in the vector
24744 }
24745 Ops.push_back(Addr);
24746 Ops.push_back(Inc);
24747
24748 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
24749 SDVTList SDTys = DAG.getVTList(Tys);
24750 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
24751 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
24752 MemVT,
24753 LoadSDN->getMemOperand());
24754
24755 // Update the uses.
24756 SDValue NewResults[] = {
24757 SDValue(LD, 0), // The result of load
24758 SDValue(UpdN.getNode(), 2) // Chain
24759 };
24760 DCI.CombineTo(LD, NewResults);
24761 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
24762 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
24763
24764 break;
24765 }
24766 return SDValue();
24767}
24768
24769/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
24770/// address translation.
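/// For example (illustrative only): with TBI, bits [63:56] of a pointer are
/// ignored by address translation (only bits [63:60] when MTE is enabled), so
/// an explicit "and x0, x0, #0x00ffffffffffffff" feeding a load or store can
/// be removed by reporting those top bits as not demanded.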
24771static bool performTBISimplification(SDValue Addr,
24772                                      TargetLowering::DAGCombinerInfo &DCI,
24773                                      SelectionDAG &DAG) {
24774 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
24775 // If MTE is enabled, TBI only applies to the top 4 bits.
24776 // Both arm64 and arm64e processes on Darwin may run with MTE enabled.
24777 unsigned NumIgnoreBits =
24778 Subtarget.hasMTE() || Subtarget.isTargetDarwin() ? 4 : 8;
24779 APInt DemandedMask = APInt::getLowBitsSet(64, 64 - NumIgnoreBits);
24780 KnownBits Known;
24781 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
24782                                       !DCI.isBeforeLegalizeOps());
24783 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24784 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
24785 DCI.CommitTargetLoweringOpt(TLO);
24786 return true;
24787 }
24788 return false;
24789}
24790
24791static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
24792 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
24793 "Expected STORE dag node in input!");
24794
24795 if (auto Store = dyn_cast<StoreSDNode>(N)) {
24796 if (!Store->isTruncatingStore() || Store->isIndexed())
24797 return SDValue();
24798 SDValue Ext = Store->getValue();
24799 auto ExtOpCode = Ext.getOpcode();
24800 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
24801 ExtOpCode != ISD::ANY_EXTEND)
24802 return SDValue();
24803 SDValue Orig = Ext->getOperand(0);
24804 if (Store->getMemoryVT() != Orig.getValueType())
24805 return SDValue();
24806 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
24807 Store->getBasePtr(), Store->getMemOperand());
24808 }
24809
24810 return SDValue();
24811}
24812
24813// A custom combine to lower load <3 x i8> as the more efficient sequence
24814// below:
24815// ldrb wX, [x0, #2]
24816// ldrh wY, [x0]
24817// orr wX, wY, wX, lsl #16
24818// fmov s0, wX
24819//
24820// Note that an alternative sequence with even fewer (although usually more
24821// complex/expensive) instructions would be:
24822// ld1r.4h { v0 }, [x0], #2
24823// ld1.b { v0 }[2], [x0]
24824//
24825// Generating this sequence unfortunately results in noticeably worse codegen
24826// for code that extends the loaded v3i8, due to legalization breaking vector
24827// shuffle detection in a way that is very difficult to work around.
24828// TODO: Revisit once v3i8 legalization has been improved in general.
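// As a rough sketch of the dataflow below (value names are illustrative):
//   L16 = zext(load i16 [ptr])        // bytes 0-1
//   L8  = zext(load i8  [ptr + 2])    // byte 2
//   Or  = L16 | (L8 << 16)            // pack into an i32
//   v   = extract_subvector (bitcast Or to v4i8), 0   // back to v3i8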
24829static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
24830 EVT MemVT = LD->getMemoryVT();
24831 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
24832 LD->getBaseAlign() >= 4)
24833 return SDValue();
24834
24835 SDLoc DL(LD);
24836 MachineFunction &MF = DAG.getMachineFunction();
24837 SDValue Chain = LD->getChain();
24838 SDValue BasePtr = LD->getBasePtr();
24839 MachineMemOperand *MMO = LD->getMemOperand();
24840 assert(LD->getOffset().isUndef() && "undef offset expected");
24841
24842 // Load 2 x i8, then 1 x i8.
24843 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
24844 TypeSize Offset2 = TypeSize::getFixed(2);
24845 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
24846 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
24847 MF.getMachineMemOperand(MMO, 2, 1));
24848
24849 // Extend to i32.
24850 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
24851 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
24852
24853 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
24854 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
24855 DAG.getConstant(16, DL, MVT::i32));
24856 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
24857 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
24858
24859 // Extract v3i8 again.
24860 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
24861 DAG.getConstant(0, DL, MVT::i64));
24862 SDValue TokenFactor = DAG.getNode(
24863     ISD::TokenFactor, DL, MVT::Other,
24864 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
24865 return DAG.getMergeValues({Extract, TokenFactor}, DL);
24866}
24867
24868 // Perform TBI simplification if supported by the target and try to break up
24869 // nontemporal loads larger than 256 bits for odd types, so that 256-bit LDNP
24870 // (Q-register) load instructions can be selected.
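// Worked example (illustrative): a non-temporal load of <9 x i32> (288 bits)
// is split into one 256-bit <8 x i32> load plus a <1 x i32> load of the
// remaining 32 bits; the pieces are concatenated and the original <9 x i32>
// is extracted, allowing the 256-bit part to be selected as LDNP.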
24871static SDValue performLOADCombine(SDNode *N,
24872                                   TargetLowering::DAGCombinerInfo &DCI,
24873                                   SelectionDAG &DAG,
24874 const AArch64Subtarget *Subtarget) {
24875 if (Subtarget->supportsAddressTopByteIgnored())
24876 performTBISimplification(N->getOperand(1), DCI, DAG);
24877
24878 LoadSDNode *LD = cast<LoadSDNode>(N);
24879 EVT RegVT = LD->getValueType(0);
24880 EVT MemVT = LD->getMemoryVT();
24881 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24882 SDLoc DL(LD);
24883
24884 // Cast ptr32 and ptr64 pointers to the default address space before a load.
24885 unsigned AddrSpace = LD->getAddressSpace();
24886 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
24887 AddrSpace == ARM64AS::PTR32_UPTR) {
24888 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24889 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
24890 SDValue Cast =
24891 DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0);
24892 return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(),
24893 Cast, LD->getPointerInfo(), MemVT,
24894 LD->getBaseAlign(),
24895 LD->getMemOperand()->getFlags());
24896 }
24897 }
24898
24899 if (LD->isVolatile() || !Subtarget->isLittleEndian())
24900 return SDValue(N, 0);
24901
24902 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
24903 return Res;
24904
24905 if (!LD->isNonTemporal())
24906 return SDValue(N, 0);
24907
24908 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
24909 MemVT.getSizeInBits() % 256 == 0 ||
24910 256 % MemVT.getScalarSizeInBits() != 0)
24911 return SDValue(N, 0);
24912
24913 SDValue Chain = LD->getChain();
24914 SDValue BasePtr = LD->getBasePtr();
24915 SDNodeFlags Flags = LD->getFlags();
24916 SmallVector<SDValue, 4> LoadOps;
24917 SmallVector<SDValue, 4> LoadOpsChain;
24918 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
24919 // and a scalar/vector load of less than 256 bits. This way we can utilize
24920 // 256-bit loads and reduce the number of load instructions generated.
24921 MVT NewVT =
24922     MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
24923                      256 / MemVT.getVectorElementType().getSizeInBits());
24924 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
24925 // Create all 256-bit loads starting from offset 0 and up to (Num256Loads-1)*32.
24926 for (unsigned I = 0; I < Num256Loads; I++) {
24927 unsigned PtrOffset = I * 32;
24928 SDValue NewPtr = DAG.getMemBasePlusOffset(
24929 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24930 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24931 SDValue NewLoad = DAG.getLoad(
24932 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
24933 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
24934 LoadOps.push_back(NewLoad);
24935 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
24936 }
24937
24938 // Process remaining bits of the load operation.
24939 // This is done by creating an UNDEF vector to match the size of the
24940 // 256-bit loads and inserting the remaining load into it. We extract the
24941 // original load type at the end using an EXTRACT_SUBVECTOR instruction.
24942 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
24943 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
24944 MVT RemainingVT = MVT::getVectorVT(
24945     MemVT.getVectorElementType().getSimpleVT(),
24946     BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
24947 SDValue NewPtr = DAG.getMemBasePlusOffset(
24948 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
24949 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
24950 SDValue RemainingLoad =
24951 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
24952 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
24953 LD->getMemOperand()->getFlags(), LD->getAAInfo());
24954 SDValue UndefVector = DAG.getUNDEF(NewVT);
24955 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
24956 SDValue ExtendedRemainingLoad =
24957 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
24958 {UndefVector, RemainingLoad, InsertIdx});
24959 LoadOps.push_back(ExtendedRemainingLoad);
24960 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
24961 EVT ConcatVT =
24962     EVT::getVectorVT(*DAG.getContext(), NewVT.getVectorElementType(),
24963                      LoadOps.size() * NewVT.getVectorNumElements());
24964 SDValue ConcatVectors =
24965 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
24966 // Extract the original vector type size.
24967 SDValue ExtractSubVector =
24968 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
24969 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
24970 SDValue TokenFactor =
24971     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
24972 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
24973}
24974
24975static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
24976 EVT VecVT = Op.getValueType();
24977 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
24978 "Need boolean vector type.");
24979
24980 if (Depth > 3)
24981   return EVT();
24982
24983 // We can get the base type from a vector compare or truncate.
24984 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
24985 return Op.getOperand(0).getValueType();
24986
24987 // If an operand is a bool vector, continue looking.
24988 EVT BaseVT;
24989 for (SDValue Operand : Op->op_values()) {
24990 if (Operand.getValueType() != VecVT)
24991 continue;
24992
24993 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
24994 if (!BaseVT.isSimple())
24995 BaseVT = OperandVT;
24996 else if (OperandVT != BaseVT)
24997     return EVT();
24998 }
24999
25000 return BaseVT;
25001}
25002
25003// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
25004// iN, we can use a trick that extracts the i^th bit from the i^th element and
25005// then performs a vector add to get a scalar bitmask. This requires that each
25006// element's bits are either all 1 or all 0.
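// Worked example (illustrative): for a v4i32 comparison result we AND with the
// mask {1, 2, 4, 8} and VECREDUCE_ADD the result, so elements {-1, 0, -1, -1}
// become {1, 0, 4, 8} and sum to 13 (0b1101), i.e. one bit per lane.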
25007static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
25008 SDLoc DL(N);
25009 SDValue ComparisonResult(N, 0);
25010 EVT VecVT = ComparisonResult.getValueType();
25011 assert(VecVT.isVector() && "Must be a vector type");
25012
25013 unsigned NumElts = VecVT.getVectorNumElements();
25014 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
25015 return SDValue();
25016
25017 if (VecVT.getVectorElementType() != MVT::i1 &&
25018 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
25019 return SDValue();
25020
25021 // If we can find the original types to work on instead of a vector of i1,
25022 // we can avoid extend/extract conversion instructions.
25023 if (VecVT.getVectorElementType() == MVT::i1) {
25024 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
25025 if (!VecVT.isSimple()) {
25026 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
25027 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
25028 }
25029 }
25030 VecVT = VecVT.changeVectorElementTypeToInteger();
25031
25032 // Large vectors don't map directly to this conversion, so to avoid too many
25033 // edge cases, we don't apply it here. The conversion will likely still be
25034 // applied later via multiple smaller vectors, whose results are concatenated.
25035 if (VecVT.getSizeInBits() > 128)
25036 return SDValue();
25037
25038 // Ensure that all elements' bits are either 0s or 1s.
25039 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
25040
25041 bool IsLE = DAG.getDataLayout().isLittleEndian();
25042 SmallVector<SDValue, 16> MaskConstants;
25043 if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
25044     VecVT == MVT::v16i8) {
25045 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
25046 // per entry. We split it into two halves, apply the mask, zip the halves to
25047 // create 8x 16-bit values, and then perform the vector reduce.
25048 for (unsigned Half = 0; Half < 2; ++Half) {
25049 for (unsigned I = 0; I < 8; ++I) {
25050 // On big-endian targets, the lane order in sub-byte vector elements
25051 // gets reversed, so we need to flip the bit index.
25052 unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
25053 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
25054 }
25055 }
25056 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
25057 SDValue RepresentativeBits =
25058 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
25059
25060 SDValue UpperRepresentativeBits =
25061 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
25062 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
25063 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
25064 RepresentativeBits, UpperRepresentativeBits);
25065 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
25066 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
25067 }
25068
25069 // All other vector sizes.
25070 unsigned NumEl = VecVT.getVectorNumElements();
25071 for (unsigned I = 0; I < NumEl; ++I) {
25072 unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
25073 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
25074 }
25075
25076 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
25077 SDValue RepresentativeBits =
25078 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
25079 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
25080 NumElts, VecVT.getVectorElementType().getSizeInBits()));
25081 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
25082}
25083
25084static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
25085 StoreSDNode *Store) {
25086 if (!Store->isTruncatingStore())
25087 return SDValue();
25088
25089 SDLoc DL(Store);
25090 SDValue VecOp = Store->getValue();
25091 EVT VT = VecOp.getValueType();
25092 EVT MemVT = Store->getMemoryVT();
25093
25094 if (!MemVT.isVector() || !VT.isVector() ||
25095 MemVT.getVectorElementType() != MVT::i1)
25096 return SDValue();
25097
25098 // If we are storing a vector that we are currently building, let
25099 // `scalarizeVectorStore()` handle this more efficiently.
25100 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
25101 return SDValue();
25102
25103 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
25104 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
25105 if (!VectorBits)
25106 return SDValue();
25107
25108 EVT StoreVT =
25109     EVT::getIntegerVT(*DAG.getContext(), MemVT.getVectorNumElements());
25110 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
25111 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
25112 Store->getMemOperand());
25113}
25114
25115// Combine store (fp_to_int X) to use vector semantics around the conversion
25116// when NEON is available. This allows us to store the in-vector result directly
25117// without transferring the result into a GPR in the process.
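// For example (illustrative assembly): instead of
//   fcvtzs w8, s0
//   str    w8, [x0]
// we can keep the converted value in a SIMD/FP register and emit
//   fcvtzs s0, s0
//   str    s0, [x0]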
25118static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
25119                                         TargetLowering::DAGCombinerInfo &DCI,
25120                                         SelectionDAG &DAG,
25121 const AArch64Subtarget *Subtarget) {
25122 // Limit to post-legalization in order to avoid peeling truncating stores.
25123 if (DCI.isBeforeLegalize())
25124 return SDValue();
25125 if (!Subtarget->isNeonAvailable())
25126 return SDValue();
25127 // Source operand is already a vector.
25128 SDValue Value = ST->getValue();
25129 if (Value.getValueType().isVector())
25130 return SDValue();
25131
25132 // Look through potential assertions.
25133 while (Value->isAssert())
25134 Value = Value.getOperand(0);
25135
25136 if (Value.getOpcode() != ISD::FP_TO_SINT &&
25137 Value.getOpcode() != ISD::FP_TO_UINT)
25138 return SDValue();
25139 if (!Value->hasOneUse())
25140 return SDValue();
25141
25142 SDValue FPSrc = Value.getOperand(0);
25143 EVT SrcVT = FPSrc.getValueType();
25144 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
25145 return SDValue();
25146
25147 // No support for assignments such as i64 = fp_to_sint i32
25148 EVT VT = Value.getSimpleValueType();
25149 if (VT != SrcVT.changeTypeToInteger())
25150 return SDValue();
25151
25152 // Create a 128-bit element vector to avoid widening. The floating point
25153 // conversion is transformed into a single element conversion via a pattern.
25154 unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
25155 EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
25156 EVT VecDstVT = VecSrcVT.changeTypeToInteger();
25157 SDLoc DL(ST);
25158 SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
25159 SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
25160
25161 SDValue Zero = DAG.getVectorIdxConstant(0, DL);
25162 SDValue Extracted =
25163 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
25164
25165 DCI.CombineTo(ST->getValue().getNode(), Extracted);
25166 return SDValue(ST, 0);
25167}
25168
25169bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
25170 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
25171 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
25172 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
25173}
25174
25175// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
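// Sketch (illustrative): the <3 x i8> source is widened to <4 x i8>, bitcast
// to v8i8/v16i8, and the three bytes are then stored individually at offsets
// 2, 1 and 0, which selects to three single-byte ST1/STRB stores.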
25176static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
25177 const AArch64Subtarget *Subtarget) {
25178 SDValue Value = ST->getValue();
25179 EVT ValueVT = Value.getValueType();
25180
25181 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
25182 Value.getOpcode() != ISD::TRUNCATE ||
25183 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
25184 return SDValue();
25185
25186 assert(ST->getOffset().isUndef() && "undef offset expected");
25187 SDLoc DL(ST);
25188 auto WideVT = EVT::getVectorVT(
25189 *DAG.getContext(),
25190 Value->getOperand(0).getValueType().getVectorElementType(), 4);
25191 SDValue UndefVector = DAG.getUNDEF(WideVT);
25192 SDValue WideTrunc = DAG.getNode(
25193 ISD::INSERT_SUBVECTOR, DL, WideVT,
25194 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
25195 SDValue Cast = DAG.getNode(
25196 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
25197 WideTrunc);
25198
25199 MachineFunction &MF = DAG.getMachineFunction();
25200 SDValue Chain = ST->getChain();
25201 MachineMemOperand *MMO = ST->getMemOperand();
25202 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
25203 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
25204 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
25205 TypeSize Offset2 = TypeSize::getFixed(2);
25206 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
25207 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
25208
25209 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
25210 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
25211 TypeSize Offset1 = TypeSize::getFixed(1);
25212 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
25213 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
25214
25215 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
25216 DAG.getConstant(0, DL, MVT::i64));
25217 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
25218 MF.getMachineMemOperand(MMO, 0, 1));
25219 return Chain;
25220}
25221
25222static unsigned getFPSubregForVT(EVT VT) {
25223 assert(VT.isSimple() && "Expected simple VT");
25224 switch (VT.getSimpleVT().SimpleTy) {
25225 case MVT::aarch64mfp8:
25226 return AArch64::bsub;
25227 case MVT::f16:
25228 return AArch64::hsub;
25229 case MVT::f32:
25230 return AArch64::ssub;
25231 case MVT::f64:
25232 return AArch64::dsub;
25233 default:
25234 llvm_unreachable("Unexpected VT!");
25235 }
25236}
25237
25238static SDValue performSTORECombine(SDNode *N,
25239                                    TargetLowering::DAGCombinerInfo &DCI,
25240                                    SelectionDAG &DAG,
25241 const AArch64Subtarget *Subtarget) {
25242 StoreSDNode *ST = cast<StoreSDNode>(N);
25243 SDValue Chain = ST->getChain();
25244 SDValue Value = ST->getValue();
25245 SDValue Ptr = ST->getBasePtr();
25246 EVT ValueVT = Value.getValueType();
25247 EVT MemVT = ST->getMemoryVT();
25248 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25249 SDLoc DL(ST);
25250
25251 if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
25252 return Res;
25253
25254 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
25255 EVT EltVT = VT.getVectorElementType();
25256 return EltVT == MVT::f32 || EltVT == MVT::f64;
25257 };
25258
25259 // Cast ptr32 and ptr64 pointers to the default address space before a store.
25260 unsigned AddrSpace = ST->getAddressSpace();
25261 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
25262 AddrSpace == ARM64AS::PTR32_UPTR) {
25263 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25264 if (PtrVT != Ptr.getSimpleValueType()) {
25265 SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0);
25266 return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(),
25267 ST->getBaseAlign(), ST->getMemOperand()->getFlags(),
25268 ST->getAAInfo());
25269 }
25270 }
25271
25272 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
25273 return Res;
25274
25275 // If this is an FP_ROUND followed by a store, fold this into a truncating
25276 // store. We can do this even if this is already a truncstore.
25277 // We purposefully don't care about legality of the nodes here as we know
25278 // they can be split down into something legal.
25279 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
25280 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
25281 Subtarget->useSVEForFixedLengthVectors() &&
25282 ValueVT.isFixedLengthVector() &&
25283 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
25284 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
25285 return DAG.getTruncStore(Chain, DL, Value.getOperand(0), Ptr, MemVT,
25286 ST->getMemOperand());
25287
25288 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
25289 return Split;
25290
25291 if (Subtarget->supportsAddressTopByteIgnored() &&
25292 performTBISimplification(N->getOperand(2), DCI, DAG))
25293 return SDValue(N, 0);
25294
25295 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
25296 return Store;
25297
25298 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
25299 return Store;
25300
25301 if (ST->isTruncatingStore() &&
25302 isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
25303 if (SDValue Rshrnb =
25304 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
25305 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
25306 MemVT, ST->getMemOperand());
25307 }
25308 }
25309
25310 // This is an integer vector_extract_elt followed by a (possibly truncating)
25311 // store. We may be able to replace this with a store of an FP subregister.
25312 if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
25313 Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25314
25315 SDValue Vector = Value.getOperand(0);
25316 SDValue ExtIdx = Value.getOperand(1);
25317 EVT VectorVT = Vector.getValueType();
25318 EVT ElemVT = VectorVT.getVectorElementType();
25319
25320 if (!ValueVT.isInteger())
25321 return SDValue();
25322
25323 // Propagate zero constants (applying this fold may miss optimizations).
25325 SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT);
25326 DAG.ReplaceAllUsesWith(Value, ZeroElt);
25327 return SDValue();
25328 }
25329
25330 if (ValueVT != MemVT && !ST->isTruncatingStore())
25331 return SDValue();
25332
25333 // This could generate an additional extract if the index is non-zero and
25334 // the extracted value has multiple uses.
25335 auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
25336 if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
25337 return SDValue();
25338
25339 // These can lower to st1, which is preferable if we're unlikely to fold the
25340 // addressing into the store.
25341 if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
25342 (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
25343 !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
25344 return SDValue();
25345
25346 if (MemVT == MVT::i64 || MemVT == MVT::i32) {
25347 // Heuristic: If there are other users of w/x integer scalars extracted
25348 // from this vector that won't fold into the store -- abandon folding.
25349 // Applying this fold may disrupt paired stores.
25350 for (const auto &Use : Vector->uses()) {
25351 if (Use.getResNo() != Vector.getResNo())
25352 continue;
25353 const SDNode *User = Use.getUser();
25354 if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25355 (!User->hasOneUse() ||
25356 (*User->user_begin())->getOpcode() != ISD::STORE))
25357 return SDValue();
25358 }
25359 }
25360
25361 SDValue ExtVector = Vector;
25362 if (!ExtCst || !ExtCst->isZero()) {
25363 // Handle extracting from lanes != 0.
25364     SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
25365                               Value.getValueType(), Vector, ExtIdx);
25366     SDValue Zero = DAG.getVectorIdxConstant(0, DL);
25367     ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT,
25368 DAG.getUNDEF(VectorVT), Ext, Zero);
25369 }
25370
25371 EVT FPMemVT = MemVT == MVT::i8
25372 ? MVT::aarch64mfp8
25373                       : EVT::getFloatingPointVT(MemVT.getFixedSizeInBits());
25374 SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
25375 FPMemVT, ExtVector);
25376
25377 return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
25378 ST->getMemOperand());
25379 }
25380
25381 return SDValue();
25382}
25383
25384static bool
25385isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
25386 if (N->getOpcode() != ISD::CONCAT_VECTORS)
25387 return false;
25388
25389 unsigned NumParts = N->getNumOperands();
25390
25391 // We should be concatenating each sequential result from a
25392 // VECTOR_INTERLEAVE.
25393 SDNode *InterleaveOp = N->getOperand(0).getNode();
25394 if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
25395 InterleaveOp->getNumOperands() != NumParts)
25396 return false;
25397
25398 for (unsigned I = 0; I < NumParts; I++)
25399 if (N->getOperand(I) != SDValue(InterleaveOp, I))
25400 return false;
25401
25402 Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end());
25403 return true;
25404}
25405
25406static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL,
25407 SDValue WideMask,
25408 unsigned RequiredNumParts) {
25409 if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
25410 SmallVector<SDValue, 4> MaskInterleaveOps;
25411 if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
25412 MaskInterleaveOps))
25413 return SDValue();
25414
25415 if (MaskInterleaveOps.size() != RequiredNumParts)
25416 return SDValue();
25417
25418 // Make sure the inputs to the vector interleave are identical.
25419 if (!llvm::all_equal(MaskInterleaveOps))
25420 return SDValue();
25421
25422 return MaskInterleaveOps[0];
25423 }
25424
25425 if (WideMask->getOpcode() != ISD::SPLAT_VECTOR)
25426 return SDValue();
25427
25428 ElementCount EC = WideMask.getValueType().getVectorElementCount();
25429 assert(EC.isKnownMultipleOf(RequiredNumParts) &&
25430 "Expected element count divisible by number of parts");
25431 EC = EC.divideCoefficientBy(RequiredNumParts);
25432 return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
25433 WideMask->getOperand(0));
25434}
25435
25436static SDValue performInterleavedMaskedStoreCombine(
25437     SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
25438 if (!DCI.isBeforeLegalize())
25439 return SDValue();
25440
25441 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
25442 SDValue WideValue = MST->getValue();
25443
25444 // Bail out if the stored value has an unexpected number of uses, since we'll
25445 // have to perform manual interleaving and may as well just use normal masked
25446 // stores. Also, discard masked stores that are truncating or indexed.
25447 if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
25448 !MST->isSimple() || !MST->getOffset().isUndef())
25449 return SDValue();
25450
25451 SmallVector<SDValue, 4> ValueInterleaveOps;
25452 if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
25453 ValueInterleaveOps))
25454 return SDValue();
25455
25456 unsigned NumParts = ValueInterleaveOps.size();
25457 if (NumParts != 2 && NumParts != 4)
25458 return SDValue();
25459
25460 // At the moment we're unlikely to see a fixed-width vector interleave as
25461 // we usually generate shuffles instead.
25462 EVT SubVecTy = ValueInterleaveOps[0].getValueType();
25463 if (!SubVecTy.isScalableVT() ||
25464 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
25465 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
25466 return SDValue();
25467
25468 SDLoc DL(N);
25469 SDValue NarrowMask =
25470 getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
25471 if (!NarrowMask)
25472 return SDValue();
25473
25474 const Intrinsic::ID IID =
25475 NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
25476 SmallVector<SDValue, 8> NewStOps;
25477 NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
25478 NewStOps.append(ValueInterleaveOps);
25479 NewStOps.append({NarrowMask, MST->getBasePtr()});
25480 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
25481}
25482
25483static SDValue performMSTORECombine(SDNode *N,
25484                                     TargetLowering::DAGCombinerInfo &DCI,
25485                                     SelectionDAG &DAG,
25486 const AArch64Subtarget *Subtarget) {
25487 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
25488 SDValue Value = MST->getValue();
25489 SDValue Mask = MST->getMask();
25490 SDLoc DL(N);
25491
25492 if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG))
25493 return Res;
25494
25495 // If this is a UZP1 followed by a masked store, fold this into a masked
25496 // truncating store. We can do this even if this is already a masked
25497 // truncstore.
25498 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
25499 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
25500 Value.getValueType().isInteger()) {
25501 Value = Value.getOperand(0);
25502 if (Value.getOpcode() == ISD::BITCAST) {
25503 EVT HalfVT =
25504 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
25505 EVT InVT = Value.getOperand(0).getValueType();
25506
25507 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
25508 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
25509 unsigned PgPattern = Mask->getConstantOperandVal(0);
25510
25511 // Ensure we can double the size of the predicate pattern
25512 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
25513 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
25514 MinSVESize) {
25515 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
25516 PgPattern);
25517 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
25518 MST->getBasePtr(), MST->getOffset(), Mask,
25519 MST->getMemoryVT(), MST->getMemOperand(),
25520 MST->getAddressingMode(),
25521 /*IsTruncating=*/true);
25522 }
25523 }
25524 }
25525 }
25526
25527 if (MST->isTruncatingStore()) {
25528 EVT ValueVT = Value->getValueType(0);
25529 EVT MemVT = MST->getMemoryVT();
25530 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
25531 return SDValue();
25532 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
25533 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
25534 MST->getOffset(), MST->getMask(),
25535 MST->getMemoryVT(), MST->getMemOperand(),
25536 MST->getAddressingMode(), true);
25537 }
25538 }
25539
25540 return SDValue();
25541}
25542
25543/// \return true if part of the index was folded into the Base.
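/// For example (illustrative): a gather with BasePtr = %p, Scale = 8 and
/// Index = X + splat(4) can instead use BasePtr = %p + 32 with Index = X,
/// leaving a simpler index for legalisation.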
25544static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
25545 SDLoc DL, SelectionDAG &DAG) {
25546 // This function assumes a vector of i64 indices.
25547 EVT IndexVT = Index.getValueType();
25548 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
25549 return false;
25550
25551 // Simplify:
25552 // BasePtr = Ptr
25553 // Index = X + splat(Offset)
25554 // ->
25555 // BasePtr = Ptr + Offset * scale.
25556 // Index = X
25557 if (Index.getOpcode() == ISD::ADD) {
25558 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
25559 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
25560 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
25561 Index = Index.getOperand(0);
25562 return true;
25563 }
25564 }
25565
25566 // Simplify:
25567 // BasePtr = Ptr
25568 // Index = (X + splat(Offset)) << splat(Shift)
25569 // ->
25570 //   BasePtr = Ptr + (Offset << Shift) * scale
25571 // Index = X << splat(shift)
25572 if (Index.getOpcode() == ISD::SHL &&
25573 Index.getOperand(0).getOpcode() == ISD::ADD) {
25574 SDValue Add = Index.getOperand(0);
25575 SDValue ShiftOp = Index.getOperand(1);
25576 SDValue OffsetOp = Add.getOperand(1);
25577 if (auto Shift = DAG.getSplatValue(ShiftOp))
25578 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
25579 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
25580 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
25581 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
25582 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
25583 Add.getOperand(0), ShiftOp);
25584 return true;
25585 }
25586 }
25587
25588 return false;
25589}
25590
25591// Analyse the specified address returning true if a more optimal addressing
25592// mode is available. When returning true all parameters are updated to reflect
25593// their recommended values.
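// For example (illustrative): an i64 step_vector(8) index whose largest
// reachable offset provably fits in 32 bits (checked against the maximum
// vscale) can be replaced by a 32-bit step vector, which is cheaper to
// legalise and matches the 32-bit scaled-offset gather/scatter forms.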
25594 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
25595                                      SDValue &BasePtr, SDValue &Index,
25596 SelectionDAG &DAG) {
25597 // Try to iteratively fold parts of the index into the base pointer to
25598 // simplify the index as much as possible.
25599 bool Changed = false;
25600 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
25601 Changed = true;
25602
25603 // Only consider element types that are pointer sized as smaller types can
25604 // be easily promoted.
25605 EVT IndexVT = Index.getValueType();
25606 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
25607 return Changed;
25608
25609 // Can indices be trivially shrunk?
25610 EVT DataVT = N->getOperand(1).getValueType();
25611 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
25612 // will later be re-extended to 64 bits in legalization
25613 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
25614 return Changed;
25615 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
25616 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
25617 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
25618 return true;
25619 }
25620
25621 // Match:
25622 // Index = step(const)
25623 int64_t Stride = 0;
25624 if (Index.getOpcode() == ISD::STEP_VECTOR) {
25625 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
25626 }
25627 // Match:
25628 // Index = step(const) << shift(const)
25629 else if (Index.getOpcode() == ISD::SHL &&
25630 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
25631 SDValue RHS = Index.getOperand(1);
25632 if (auto *Shift =
25633         dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
25634   int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
25635 Stride = Step << Shift->getZExtValue();
25636 }
25637 }
25638
25639 // Return early because no supported pattern is found.
25640 if (Stride == 0)
25641 return Changed;
25642
25643 if (Stride < std::numeric_limits<int32_t>::min() ||
25644 Stride > std::numeric_limits<int32_t>::max())
25645 return Changed;
25646
25647 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
25648 unsigned MaxVScale =
25649     Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
25650 int64_t LastElementOffset =
25651 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
25652
25653 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
25654 LastElementOffset > std::numeric_limits<int32_t>::max())
25655 return Changed;
25656
25657 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
25658 // Stride is not explicitly scaled by 'Scale' here, because that scaling
25659 // happens as part of the gather/scatter addressing mode.
25660 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
25661 return true;
25662}
25663
25664 static SDValue performMaskedGatherScatterCombine(
25665     SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
25666 if (!DCI.isBeforeLegalize())
25667 return SDValue();
25668 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
25669
25670 SDLoc DL(MGS);
25671 SDValue Chain = MGS->getChain();
25672 SDValue Scale = MGS->getScale();
25673 SDValue Index = MGS->getIndex();
25674 SDValue Mask = MGS->getMask();
25675 SDValue BasePtr = MGS->getBasePtr();
25676 ISD::MemIndexType IndexType = MGS->getIndexType();
25677
25678 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
25679 return SDValue();
25680
25681 // Here we catch such cases early and change MGATHER's IndexType to allow
25682 // the use of an Index that's more legalisation friendly.
25683 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
25684 SDValue PassThru = MGT->getPassThru();
25685 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
25686 return DAG.getMaskedGather(
25687 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
25688 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
25689 }
25690 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
25691 SDValue Data = MSC->getValue();
25692 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
25693 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
25694 DL, Ops, MSC->getMemOperand(), IndexType,
25695 MSC->isTruncatingStore());
25696 }
25697 auto *HG = cast<MaskedHistogramSDNode>(MGS);
25698 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
25699 Index, Scale, HG->getIntID()};
25700 return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
25701 DL, Ops, HG->getMemOperand(), IndexType);
25702}
25703
25704/// Target-specific DAG combine function for NEON load/store intrinsics
25705/// to merge base address updates.
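/// For example (illustrative): "ld2 { v0.4s, v1.4s }, [x0]" followed by
/// "add x0, x0, #32" can be merged into the post-indexed form
/// "ld2 { v0.4s, v1.4s }, [x0], #32".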
25706 static SDValue performNEONPostLDSTCombine(SDNode *N,
25707                                           TargetLowering::DAGCombinerInfo &DCI,
25708                                           SelectionDAG &DAG) {
25709 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
25710 return SDValue();
25711
25712 unsigned AddrOpIdx = N->getNumOperands() - 1;
25713 SDValue Addr = N->getOperand(AddrOpIdx);
25714
25715 // Search for a use of the address operand that is an increment.
25716 for (SDUse &Use : Addr->uses()) {
25717 SDNode *User = Use.getUser();
25718 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
25719 continue;
25720
25721 // Check that the add is independent of the load/store. Otherwise, folding
25722 // it would create a cycle.
25723 SmallPtrSet<const SDNode *, 32> Visited;
25724 SmallVector<const SDNode *, 16> Worklist;
25725 Visited.insert(Addr.getNode());
25726 Worklist.push_back(N);
25727 Worklist.push_back(User);
25728 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
25729 SDNode::hasPredecessorHelper(User, Visited, Worklist))
25730 continue;
25731
25732 // Find the new opcode for the updating load/store.
25733 bool IsStore = false;
25734 bool IsLaneOp = false;
25735 bool IsDupOp = false;
25736 unsigned NewOpc = 0;
25737 unsigned NumVecs = 0;
25738 unsigned IntNo = N->getConstantOperandVal(1);
25739 switch (IntNo) {
25740 default: llvm_unreachable("unexpected intrinsic for Neon base update");
25741 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
25742 NumVecs = 2; break;
25743 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
25744 NumVecs = 3; break;
25745 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
25746 NumVecs = 4; break;
25747 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
25748 NumVecs = 2; IsStore = true; break;
25749 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
25750 NumVecs = 3; IsStore = true; break;
25751 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
25752 NumVecs = 4; IsStore = true; break;
25753 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
25754 NumVecs = 2; break;
25755 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
25756 NumVecs = 3; break;
25757 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
25758 NumVecs = 4; break;
25759 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
25760 NumVecs = 2; IsStore = true; break;
25761 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
25762 NumVecs = 3; IsStore = true; break;
25763 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
25764 NumVecs = 4; IsStore = true; break;
25765 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
25766 NumVecs = 2; IsDupOp = true; break;
25767 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
25768 NumVecs = 3; IsDupOp = true; break;
25769 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
25770 NumVecs = 4; IsDupOp = true; break;
25771 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
25772 NumVecs = 2; IsLaneOp = true; break;
25773 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
25774 NumVecs = 3; IsLaneOp = true; break;
25775 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
25776 NumVecs = 4; IsLaneOp = true; break;
25777 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
25778 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
25779 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
25780 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
25781 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
25782 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
25783 }
25784
25785 EVT VecTy;
25786 if (IsStore)
25787 VecTy = N->getOperand(2).getValueType();
25788 else
25789 VecTy = N->getValueType(0);
25790
25791 // If the increment is a constant, it must match the memory ref size.
25792 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
25793 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
25794 uint32_t IncVal = CInc->getZExtValue();
25795 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
25796 if (IsLaneOp || IsDupOp)
25797 NumBytes /= VecTy.getVectorNumElements();
25798 if (IncVal != NumBytes)
25799 continue;
25800 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
25801 }
25802 SmallVector<SDValue, 8> Ops;
25803 Ops.push_back(N->getOperand(0)); // Incoming chain
25804 // Load lane and store have vector list as input.
25805 if (IsLaneOp || IsStore)
25806 for (unsigned i = 2; i < AddrOpIdx; ++i)
25807 Ops.push_back(N->getOperand(i));
25808 Ops.push_back(Addr); // Base register
25809 Ops.push_back(Inc);
25810
25811 // Return Types.
25812 EVT Tys[6];
25813 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
25814 unsigned n;
25815 for (n = 0; n < NumResultVecs; ++n)
25816 Tys[n] = VecTy;
25817 Tys[n++] = MVT::i64; // Type of write back register
25818 Tys[n] = MVT::Other; // Type of the chain
25819 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
25820
25821 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
25822 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
25823 MemInt->getMemoryVT(),
25824 MemInt->getMemOperand());
25825
25826 // Update the uses.
25827 std::vector<SDValue> NewResults;
25828 for (unsigned i = 0; i < NumResultVecs; ++i) {
25829 NewResults.push_back(SDValue(UpdN.getNode(), i));
25830 }
25831 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
25832 DCI.CombineTo(N, NewResults);
25833 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
25834
25835 break;
25836 }
25837 return SDValue();
25838}
25839
25840// Checks to see if the value is the prescribed width and returns information
25841// about its extension mode.
25842static
25843bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
25844 ExtType = ISD::NON_EXTLOAD;
25845 switch(V.getNode()->getOpcode()) {
25846 default:
25847 return false;
25848 case ISD::LOAD: {
25849 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
25850 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
25851 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
25852 ExtType = LoadNode->getExtensionType();
25853 return true;
25854 }
25855 return false;
25856 }
25857 case ISD::AssertSext: {
25858 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25859 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25860 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25861 ExtType = ISD::SEXTLOAD;
25862 return true;
25863 }
25864 return false;
25865 }
25866 case ISD::AssertZext: {
25867 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
25868 if ((TypeNode->getVT() == MVT::i8 && width == 8)
25869 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
25870 ExtType = ISD::ZEXTLOAD;
25871 return true;
25872 }
25873 return false;
25874 }
25875 case ISD::Constant:
25876 case ISD::TargetConstant: {
25877 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
25878 1LL << (width - 1);
25879 }
25880 }
25881
25882 return true;
25883}
25884
25885// This function does a whole lot of voodoo to determine if the tests are
25886// equivalent without and with a mask. Essentially what happens is that given a
25887// DAG resembling:
25888//
25889// +-------------+ +-------------+ +-------------+ +-------------+
25890// | Input | | AddConstant | | CompConstant| | CC |
25891// +-------------+ +-------------+ +-------------+ +-------------+
25892// | | | |
25893// V V | +----------+
25894// +-------------+ +----+ | |
25895// | ADD | |0xff| | |
25896// +-------------+ +----+ | |
25897// | | | |
25898// V V | |
25899// +-------------+ | |
25900// | AND | | |
25901// +-------------+ | |
25902// | | |
25903// +-----+ | |
25904// | | |
25905// V V V
25906// +-------------+
25907// | CMP |
25908// +-------------+
25909//
25910// The AND node may be safely removed for some combinations of inputs. In
25911// particular we need to take into account the extension type of the Input,
25912// the exact values of AddConstant, CompConstant, and CC, along with the nominal
25913// width of the input (this can work for any width inputs, the above graph is
25914 // specific to 8 bits).
25915//
25916// The specific equations were worked out by generating output tables for each
25917 // AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
25918// problem was simplified by working with 4 bit inputs, which means we only
25919// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
25920// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
25921// patterns present in both extensions (0,7). For every distinct set of
25922// AddConstant and CompConstants bit patterns we can consider the masked and
25923// unmasked versions to be equivalent if the result of this function is true for
25924 // all 16 distinct bit patterns for the current extension type of Input (w0).
25925//
25926// sub w8, w0, w1
25927// and w10, w8, #0x0f
25928// cmp w8, w2
25929// cset w9, AArch64CC
25930// cmp w10, w2
25931// cset w11, AArch64CC
25932// cmp w9, w11
25933// cset w0, eq
25934// ret
25935//
25936// Since the above function shows when the outputs are equivalent it defines
25937// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
25938// would be expensive to run during compiles. The equations below were written
25939 // in a test harness that confirmed they gave outputs equivalent to the above
25940 // function for all inputs, so they can instead be used to determine if the
25941 // removal is legal.
25942//
25943 // isEquivalentMaskless() is the code for testing whether the AND can be removed,
25944 // factored out of the DAG recognition because the DAG can take several forms.
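// As a small concrete instance (illustrative): with a zero-extended 8-bit
// Input and AddConstant == 0, the ADD result already fits in 8 bits, so the
// AND with 0xff cannot change the EQ/NE comparison against any CompConstant
// and may be removed.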
25945
25946static bool isEquivalentMaskless(unsigned CC, unsigned width,
25947 ISD::LoadExtType ExtType, int AddConstant,
25948 int CompConstant) {
25949 // By being careful about our equations and only writing them in terms of
25950 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
25951 // make them generally applicable to all bit widths.
25952 int MaxUInt = (1 << width);
25953
25954 // For the purposes of these comparisons sign extending the type is
25955 // equivalent to zero extending the add and displacing it by half the integer
25956 // width. Provided we are careful and make sure our equations are valid over
25957 // the whole range we can just adjust the input and avoid writing equations
25958 // for sign extended inputs.
25959 if (ExtType == ISD::SEXTLOAD)
25960 AddConstant -= (1 << (width-1));
25961
25962 switch(CC) {
25963 case AArch64CC::LE:
25964 case AArch64CC::GT:
25965 if ((AddConstant == 0) ||
25966 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
25967 (AddConstant >= 0 && CompConstant < 0) ||
25968 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
25969 return true;
25970 break;
25971 case AArch64CC::LT:
25972 case AArch64CC::GE:
25973 if ((AddConstant == 0) ||
25974 (AddConstant >= 0 && CompConstant <= 0) ||
25975 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
25976 return true;
25977 break;
25978 case AArch64CC::HI:
25979 case AArch64CC::LS:
25980 if ((AddConstant >= 0 && CompConstant < 0) ||
25981 (AddConstant <= 0 && CompConstant >= -1 &&
25982 CompConstant < AddConstant + MaxUInt))
25983 return true;
25984 break;
25985 case AArch64CC::PL:
25986 case AArch64CC::MI:
25987 if ((AddConstant == 0) ||
25988 (AddConstant > 0 && CompConstant <= 0) ||
25989 (AddConstant < 0 && CompConstant <= AddConstant))
25990 return true;
25991 break;
25992 case AArch64CC::LO:
25993 case AArch64CC::HS:
25994 if ((AddConstant >= 0 && CompConstant <= 0) ||
25995 (AddConstant <= 0 && CompConstant >= 0 &&
25996 CompConstant <= AddConstant + MaxUInt))
25997 return true;
25998 break;
25999 case AArch64CC::EQ:
26000 case AArch64CC::NE:
26001 if ((AddConstant > 0 && CompConstant < 0) ||
26002 (AddConstant < 0 && CompConstant >= 0 &&
26003 CompConstant < AddConstant + MaxUInt) ||
26004 (AddConstant >= 0 && CompConstant >= 0 &&
26005 CompConstant >= AddConstant) ||
26006 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
26007 return true;
26008 break;
26009 case AArch64CC::VS:
26010 case AArch64CC::VC:
26011 case AArch64CC::AL:
26012 case AArch64CC::NV:
26013 return true;
26014 case AArch64CC::Invalid:
26015 break;
26016 }
26017
26018 return false;
26019}
26020
26021 // (X & C) >u Mask --> (X & (C & ~Mask)) != 0
26022 // (X & C) <u Pow2 --> (X & (C & ~(Pow2-1))) == 0
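// For instance (illustrative): with C = 0xff and Mask = 0x0f, the compare
// (X & 0xff) >u 0x0f becomes (X & 0xf0) != 0, i.e. an ANDS with 0xf0 whose
// condition is NE (and the <u form similarly becomes EQ).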
26023 static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
26024                                         SDNode *AndNode, SelectionDAG &DAG,
26025 unsigned CCIndex, unsigned CmpIndex,
26026 unsigned CC) {
26027 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
26028 if (!SubsC)
26029 return SDValue();
26030
26031 APInt SubsAP = SubsC->getAPIntValue();
26032 if (CC == AArch64CC::HI) {
26033 if (!SubsAP.isMask())
26034 return SDValue();
26035 } else if (CC == AArch64CC::LO) {
26036 if (!SubsAP.isPowerOf2())
26037 return SDValue();
26038 } else
26039 return SDValue();
26040
26041 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
26042 if (!AndC)
26043 return SDValue();
26044
26045 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
26046
26047 SDLoc DL(N);
26048 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
26049 SDValue ANDS = DAG.getNode(
26050 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
26051 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
26052 SDValue AArch64_CC =
26053     DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
26054                     N->getOperand(CCIndex)->getValueType(0));
26055
26056 // For now, only performCSELCombine and performBRCONDCombine call this
26057 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex with 4
26058 // operands, so just initialize the ops directly to simplify the code. If we
26059 // ever get a caller with a different CCIndex or CmpIndex, this will need to
26060 // be rewritten with a loop over the operands.
26061 // TODO: Do we need to assert that the number of operands is 4 here?
26062 assert((CCIndex == 2 && CmpIndex == 3) &&
26063 "Expected CCIndex to be 2 and CmpIndex to be 3.");
26064 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
26065 ANDS.getValue(1)};
26066 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
26067}
26068
26069static
26070 SDValue performCONDCombine(SDNode *N,
26071                            TargetLowering::DAGCombinerInfo &DCI,
26072                            SelectionDAG &DAG, unsigned CCIndex,
26073 unsigned CmpIndex) {
26074 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
26075 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
26076 unsigned CondOpcode = SubsNode->getOpcode();
26077
26078 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
26079 !SubsNode->hasOneUse())
26080 return SDValue();
26081
26082 // There is a SUBS feeding this condition. Is it fed by a mask we can
26083 // use?
26084
26085 SDNode *AndNode = SubsNode->getOperand(0).getNode();
26086 unsigned MaskBits = 0;
26087
26088 if (AndNode->getOpcode() != ISD::AND)
26089 return SDValue();
26090
26091 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
26092 CmpIndex, CC))
26093 return Val;
26094
26095 // X & M ?= C --> (C << clz(M)) ?= (X << clz(M)) where M is a non-empty
26096 // sequence of ones starting at the least significant bit with the remainder
26097 // zero and C is a constant s.t. (C & ~M) == 0 that cannot be materialised
26098 // into a SUBS (immediate). The transformed form can be matched into a SUBS
26099 // (shifted register).
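// For instance (illustrative): with M = 0xffffff and C = 0x123456 (not a
// legal arithmetic immediate), comparing (X & 0xffffff) against 0x123456 is
// rewritten as comparing (0x123456 << 40) against (X << 40), which matches
// SUBS with a shifted-register operand.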
26100 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && AndNode->hasOneUse() &&
26101 isa<ConstantSDNode>(AndNode->getOperand(1)) &&
26102 isa<ConstantSDNode>(SubsNode->getOperand(1))) {
26103 SDValue X = AndNode->getOperand(0);
26104 APInt M = AndNode->getConstantOperandAPInt(1);
26105 APInt C = SubsNode->getConstantOperandAPInt(1);
26106
26107 if (M.isMask() && C.isSubsetOf(M) && !isLegalArithImmed(C.getZExtValue())) {
26108 SDLoc DL(SubsNode);
26109 EVT VT = SubsNode->getValueType(0);
26110 unsigned ShiftAmt = M.countl_zero();
26111 SDValue ShiftedX = DAG.getNode(
26112 ISD::SHL, DL, VT, X, DAG.getShiftAmountConstant(ShiftAmt, VT, DL));
26113 SDValue ShiftedC = DAG.getConstant(C << ShiftAmt, DL, VT);
26114 SDValue NewSubs = DAG.getNode(AArch64ISD::SUBS, DL, SubsNode->getVTList(),
26115 ShiftedC, ShiftedX);
26116 DCI.CombineTo(SubsNode, NewSubs, NewSubs.getValue(1));
26117 return SDValue(N, 0);
26118 }
26119 }
26120
26121 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
26122 uint32_t CNV = CN->getZExtValue();
26123 if (CNV == 255)
26124 MaskBits = 8;
26125 else if (CNV == 65535)
26126 MaskBits = 16;
26127 }
26128
26129 if (!MaskBits)
26130 return SDValue();
26131
26132 SDValue AddValue = AndNode->getOperand(0);
26133
26134 if (AddValue.getOpcode() != ISD::ADD)
26135 return SDValue();
26136
26137 // The basic dag structure is correct, grab the inputs and validate them.
26138
26139 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
26140 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
26141 SDValue SubsInputValue = SubsNode->getOperand(1);
26142
26143 // The mask is present and the provenance of all the values is a smaller type;
26144 // let's see if the mask is superfluous.
26145
26146 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
26147 !isa<ConstantSDNode>(SubsInputValue.getNode()))
26148 return SDValue();
26149
26150 ISD::LoadExtType ExtType;
26151
26152 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
26153 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
26154 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
26155 return SDValue();
26156
26157 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
26158 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
26159 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
26160 return SDValue();
26161
26162 // The AND is not necessary, remove it.
26163
26164 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
26165 SubsNode->getValueType(1));
26166 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
26167
26168 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
26169 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
26170
26171 return SDValue(N, 0);
26172}
26173
26174// Optimize compare with zero and branch.
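// For example (illustrative): a conditional branch on (SUBS x, #0) with EQ or
// NE, where the subtraction result itself is unused, is folded into a single
// CBZ/CBNZ on x.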
26175 static SDValue performBRCONDCombine(SDNode *N,
26176                                     TargetLowering::DAGCombinerInfo &DCI,
26177                                     SelectionDAG &DAG) {
26178 MachineFunction &MF = DAG.getMachineFunction();
26179 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
26180 // will not be produced, as they are conditional branch instructions that do
26181 // not set flags.
26182 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
26183 return SDValue();
26184
26185 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
26186 N = NV.getNode();
26187 SDValue Chain = N->getOperand(0);
26188 SDValue Dest = N->getOperand(1);
26189 SDValue CCVal = N->getOperand(2);
26190 SDValue Cmp = N->getOperand(3);
26191
26192 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
26193 unsigned CC = CCVal->getAsZExtVal();
26194 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
26195 return SDValue();
26196
26197 // Fold away brcond(NE, cmp(csel(1, 0, CC, Cmp), 1)) -> brcond(~CC, Cmp)
26198 if (isCMP(Cmp) && CC == AArch64CC::NE && isOneConstant(Cmp.getOperand(1))) {
26199 SDValue CSel = Cmp.getOperand(0);
26200 auto CSelCC = getCSETCondCode(CSel);
26201 if (CSelCC) {
26202 SDLoc DL(N);
26203 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest,
26204 getCondCode(DAG, getInvertedCondCode(*CSelCC)),
26205 CSel.getOperand(3));
26206 }
26207 }
26208
26209 unsigned CmpOpc = Cmp.getOpcode();
26210 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
26211 return SDValue();
26212
26213 // Only attempt folding if there is only one use of the flag and no use of the
26214 // value.
26215 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
26216 return SDValue();
26217
26218 SDValue LHS = Cmp.getOperand(0);
26219 SDValue RHS = Cmp.getOperand(1);
26220
26221 assert(LHS.getValueType() == RHS.getValueType() &&
26222 "Expected the value type to be the same for both operands!");
26223 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
26224 return SDValue();
26225
26226 if (isNullConstant(LHS))
26227 std::swap(LHS, RHS);
26228
26229 if (!isNullConstant(RHS))
26230 return SDValue();
26231
26232 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
26233 LHS.getOpcode() == ISD::SRL)
26234 return SDValue();
26235
26236 // Fold the compare into the branch instruction.
26237 SDValue BR;
26238 if (CC == AArch64CC::EQ)
26239 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
26240 else
26241 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
26242
26243 // Do not add new nodes to DAG combiner worklist.
26244 DCI.CombineTo(N, BR, false);
26245
26246 return SDValue();
26247}
26248
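// Fold "CSEL 0, cttz(X), eq(X, 0)" (and the NE variant) into
// "AND cttz(X), bitwidth-1". Illustrative example for i32: when X == 0, cttz
// returns 32 and 32 & 31 == 0, which is exactly the value the CSEL would have
// selected; for X != 0, cttz(X) <= 31 is left unchanged by the AND.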
26249static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
26250  unsigned CC = N->getConstantOperandVal(2);
26251 SDValue SUBS = N->getOperand(3);
26252 SDValue Zero, CTTZ;
26253
26254 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
26255 Zero = N->getOperand(0);
26256 CTTZ = N->getOperand(1);
26257 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
26258 Zero = N->getOperand(1);
26259 CTTZ = N->getOperand(0);
26260 } else
26261 return SDValue();
26262
26263 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
26264 (CTTZ.getOpcode() == ISD::TRUNCATE &&
26265 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
26266 return SDValue();
26267
26268 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
26269 "Illegal type in CTTZ folding");
26270
26271 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
26272 return SDValue();
26273
26274 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
26275 ? CTTZ.getOperand(0).getOperand(0)
26276 : CTTZ.getOperand(0);
26277
26278 if (X != SUBS.getOperand(0))
26279 return SDValue();
26280
26281 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
26282 ? CTTZ.getOperand(0).getValueSizeInBits()
26283 : CTTZ.getValueSizeInBits();
26284 SDValue BitWidthMinusOne =
26285 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
26286 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
26287 BitWidthMinusOne);
26288}
26289
26290// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
26291// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
26292// Where x and y are constants and x != y
26293
26294// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
26295// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
26296// Where x and y are constants and x != y
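// For example (illustrative): (CSEL l r EQ (CMP (CSEL 1 0 GT cond) 1)) becomes
// (CSEL l r GT cond), since the inner CSEL is equal to 1 exactly when GT holds.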
26297static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
26298  SDValue L = Op->getOperand(0);
26299 SDValue R = Op->getOperand(1);
26300 AArch64CC::CondCode OpCC =
26301 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
26302
26303 SDValue OpCmp = Op->getOperand(3);
26304 if (!isCMP(OpCmp))
26305 return SDValue();
26306
26307 SDValue CmpLHS = OpCmp.getOperand(0);
26308 SDValue CmpRHS = OpCmp.getOperand(1);
26309
26310 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
26311 std::swap(CmpLHS, CmpRHS);
26312 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
26313 return SDValue();
26314
26315 SDValue X = CmpLHS->getOperand(0);
26316 SDValue Y = CmpLHS->getOperand(1);
26317 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
26318 return SDValue();
26319 }
26320
26321  // If one of the constants is an opaque constant, the X and Y SDNodes are
26322  // still distinct, but the underlying values may be the same. Check the
26323  // APInts here to make sure the code is correct.
26324  ConstantSDNode *CX = cast<ConstantSDNode>(X);
26325  ConstantSDNode *CY = cast<ConstantSDNode>(Y);
26326  if (CX->getAPIntValue() == CY->getAPIntValue())
26327 return SDValue();
26328
26329  AArch64CC::CondCode CC =
26330      static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
26331 SDValue Cond = CmpLHS->getOperand(3);
26332
26333 if (CmpRHS == Y)
26334    CC = getInvertedCondCode(CC);
26335  else if (CmpRHS != X)
26336 return SDValue();
26337
26338 if (OpCC == AArch64CC::NE)
26339    CC = getInvertedCondCode(CC);
26340  else if (OpCC != AArch64CC::EQ)
26341 return SDValue();
26342
26343 SDLoc DL(Op);
26344 EVT VT = Op->getValueType(0);
26345
26346 SDValue CCValue = getCondCode(DAG, CC);
26347 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
26348}
26349
26350// Reassociate the true/false expressions of a CSEL instruction to obtain a
26351// common subexpression with the comparison instruction. For example, change
26352// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
26353// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
26354// subexpression.
26355static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
26356  SDValue SubsNode = N->getOperand(3);
26357 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
26358 return SDValue();
26359
26360 SDValue CmpOpToMatch = SubsNode.getOperand(1);
26361 SDValue CmpOpOther = SubsNode.getOperand(0);
26362 EVT VT = N->getValueType(0);
26363
26364 unsigned ExpectedOpcode;
26365 SDValue ExpectedOp;
26366 SDValue SubsOp;
26367 auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
26368 if (CmpOpConst) {
26369 ExpectedOpcode = ISD::ADD;
26370 ExpectedOp =
26371 DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
26372 CmpOpConst->getValueType(0));
26373 SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
26374 CmpOpConst->getValueType(0));
26375 } else {
26376 ExpectedOpcode = ISD::SUB;
26377 ExpectedOp = CmpOpToMatch;
26378 SubsOp = CmpOpToMatch;
26379 }
26380
26381 // Get the operand that can be reassociated with the SUBS instruction.
26382 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
26383 if (Op.getOpcode() != ExpectedOpcode)
26384 return SDValue();
26385 if (Op.getOperand(0).getOpcode() != ISD::ADD ||
26386 !Op.getOperand(0).hasOneUse())
26387 return SDValue();
26388 SDValue X = Op.getOperand(0).getOperand(0);
26389 SDValue Y = Op.getOperand(0).getOperand(1);
26390 if (X != CmpOpOther)
26391 std::swap(X, Y);
26392 if (X != CmpOpOther)
26393 return SDValue();
26394 if (ExpectedOp != Op.getOperand(1))
26395 return SDValue();
26396 return Y;
26397 };
26398
26399 // Try the reassociation using the given constant and condition code.
26400 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
26401 SDValue SubsOp) {
26402 SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
26403 SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
26404 if (!TReassocOp && !FReassocOp)
26405 return SDValue();
26406
26407 SDValue NewCmp =
26408 DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
26409 DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
26410
26411 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
26412 if (!ReassocOp)
26413 return N->getOperand(OpNum);
26414 SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
26415 NewCmp.getValue(0), ReassocOp);
26416 DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
26417 return Res;
26418 };
26419
26420 SDValue TValReassoc = Reassociate(TReassocOp, 0);
26421 SDValue FValReassoc = Reassociate(FReassocOp, 1);
26422 return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
26423 getCondCode(DAG, NewCC), NewCmp.getValue(1));
26424 };
26425
26426 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
26427
26428 // First, try to eliminate the compare instruction by searching for a
26429 // subtraction with the same constant.
26430 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
26431 return R;
26432
26433 if (!CmpOpConst) {
26434 // Try again with the operands of the SUBS instruction and the condition
26435 // swapped. Due to canonicalization, this only helps for non-constant
26436 // operands of the SUBS instruction.
26437 std::swap(CmpOpToMatch, CmpOpOther);
26438 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
26439 return R;
26440 return SDValue();
26441 }
26442
26443 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
26444 return SDValue();
26445
26446 // Next, search for a subtraction with a slightly different constant. By
26447 // adjusting the condition code, we can still eliminate the compare
26448 // instruction. Adjusting the constant is only valid if it does not result
26449 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
26450 // Since such comparisons are trivially true/false, we should not encounter
26451 // them here but check for them nevertheless to be on the safe side.
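  // For example (illustrative): "x <u 10" (LO with constant 10) is equivalent
  // to "x <=u 9" (LS with constant 9), so an existing (SUBS x, 9) can be reused
  // instead of emitting a fresh compare against 10.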
26452 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
26453 AArch64CC::CondCode NewCC) {
26454 auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
26455 CmpOpConst->getValueType(0));
26456 auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
26457 CmpOpConst->getValueType(0));
26458 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
26459 };
26460 switch (CC) {
26461 case AArch64CC::EQ:
26462 case AArch64CC::LS:
26463 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
26464 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
26465 case AArch64CC::NE:
26466 case AArch64CC::HI:
26467 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
26468 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
26469 case AArch64CC::LO:
26470 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
26471 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
26472 case AArch64CC::HS:
26473 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
26474 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
26475 case AArch64CC::LT:
26476 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
26477 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
26478 case AArch64CC::LE:
26479 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
26480 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
26481 case AArch64CC::GT:
26482 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
26483 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
26484 case AArch64CC::GE:
26485 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
26486 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
26487 default:
26488 return SDValue();
26489 }
26490}
26491
26492static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
26493  AArch64CC::CondCode OpCC =
26494 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
26495
26496 if (OpCC != AArch64CC::NE)
26497 return SDValue();
26498
26499 SDValue PTest = Op->getOperand(3);
26500 if (PTest.getOpcode() != AArch64ISD::PTEST_ANY)
26501 return SDValue();
26502
26503 SDValue TruePred = PTest.getOperand(0);
26504 SDValue AnyPred = PTest.getOperand(1);
26505
26506 if (TruePred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
26507 TruePred = TruePred.getOperand(0);
26508
26509 if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
26510 AnyPred = AnyPred.getOperand(0);
26511
26512 if (TruePred != AnyPred && !isAllActivePredicate(DAG, TruePred))
26513 return SDValue();
26514
26515 SDValue LastB = Op->getOperand(0);
26516 SDValue Default = Op->getOperand(1);
26517
26518 if (LastB.getOpcode() != AArch64ISD::LASTB || LastB.getOperand(0) != AnyPred)
26519 return SDValue();
26520
26521 return DAG.getNode(AArch64ISD::CLASTB_N, SDLoc(Op), Op->getValueType(0),
26522 AnyPred, Default, LastB.getOperand(1));
26523}
26524
26525// Optimize CSEL instructions
26526static SDValue performCSELCombine(SDNode *N,
26527                                  TargetLowering::DAGCombinerInfo &DCI,
26528                                  SelectionDAG &DAG) {
26529 // CSEL x, x, cc -> x
26530 if (N->getOperand(0) == N->getOperand(1))
26531 return N->getOperand(0);
26532
26533 if (SDValue R = foldCSELOfCSEL(N, DAG))
26534 return R;
26535
26536 // Try to reassociate the true/false expressions so that we can do CSE with
26537 // a SUBS instruction used to perform the comparison.
26538  if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
26539    return R;
26540
26541 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
26542 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
26543 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
26544 return Folded;
26545
26546 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
26547 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
26548 SDValue Cond = N->getOperand(3);
26549 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
26550 Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
26551 DAG.doesNodeExist(ISD::SUB, N->getVTList(),
26552 {Cond.getOperand(1), Cond.getOperand(0)}) &&
26553 !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
26554 {Cond.getOperand(0), Cond.getOperand(1)}) &&
26555 !isNullConstant(Cond.getOperand(1))) {
26556 AArch64CC::CondCode OldCond =
26557 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
26558 AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
26559 if (NewCond != AArch64CC::AL) {
26560 SDLoc DL(N);
26561 SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
26562 Cond.getOperand(1), Cond.getOperand(0));
26563 return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
26564 N->getOperand(1), getCondCode(DAG, NewCond),
26565 Sub.getValue(1));
26566 }
26567 }
26568
26569 // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
26570 if (SDValue CondLast = foldCSELofLASTB(N, DAG))
26571 return CondLast;
26572
26573 return performCONDCombine(N, DCI, DAG, 2, 3);
26574}
26575
26576// Try to re-use an already extended operand of a vector SetCC feeding an
26577// extended select. Doing so avoids requiring another full extension of the
26578// SET_CC result when lowering the select.
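// For example (illustrative): if a v8i8 SETCC feeds VSELECTs that produce
// v8i16 values and a sign_extend of the first SETCC operand to v8i16 already
// exists, the comparison is rebuilt directly on the extended v8i16 operands.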
26579static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
26580  EVT Op0MVT = Op->getOperand(0).getValueType();
26581 if (!Op0MVT.isVector() || Op->use_empty())
26582 return SDValue();
26583
26584 // Make sure that all uses of Op are VSELECTs with result matching types where
26585 // the result type has a larger element type than the SetCC operand.
26586 SDNode *FirstUse = *Op->user_begin();
26587 if (FirstUse->getOpcode() != ISD::VSELECT)
26588 return SDValue();
26589 EVT UseMVT = FirstUse->getValueType(0);
26590 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
26591 return SDValue();
26592 if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
26593 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
26594 }))
26595 return SDValue();
26596
26597 APInt V;
26598 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
26599 return SDValue();
26600
26601 SDLoc DL(Op);
26602 SDValue Op0ExtV;
26603 SDValue Op1ExtV;
26604 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
26605 // Check if the first operand of the SET_CC is already extended. If it is,
26606 // split the SET_CC and re-use the extended version of the operand.
26607 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
26608 Op->getOperand(0));
26609 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
26610 Op->getOperand(0));
26611 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
26612 Op0ExtV = SDValue(Op0SExt, 0);
26613 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
26614 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
26615 Op0ExtV = SDValue(Op0ZExt, 0);
26616 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
26617 } else
26618 return SDValue();
26619
26620 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
26621 Op0ExtV, Op1ExtV, Op->getOperand(2));
26622}
26623
26624static SDValue
26625performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26626                               SelectionDAG &DAG) {
26627 SDValue Vec = N->getOperand(0);
26628 if (DCI.isBeforeLegalize() &&
26629 Vec.getValueType().getVectorElementType() == MVT::i1 &&
26632 SDLoc DL(N);
26633 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
26634 DAG);
26635 }
26636
26637 return SDValue();
26638}
26639
26640static SDValue performSETCCCombine(SDNode *N,
26641                                   TargetLowering::DAGCombinerInfo &DCI,
26642                                   SelectionDAG &DAG) {
26643 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
26644 SDValue LHS = N->getOperand(0);
26645 SDValue RHS = N->getOperand(1);
26646 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
26647 SDLoc DL(N);
26648 EVT VT = N->getValueType(0);
26649
26650 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
26651 return V;
26652
26653 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
26654 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
26655 LHS->getOpcode() == AArch64ISD::CSEL &&
26656 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
26657 LHS->hasOneUse()) {
26658 // Invert CSEL's condition.
26659 auto OldCond =
26660 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
26661 auto NewCond = getInvertedCondCode(OldCond);
26662
26663 // csel 0, 1, !cond, X
26664 SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(),
26665 LHS.getOperand(0), LHS.getOperand(1),
26666 getCondCode(DAG, NewCond), LHS.getOperand(3));
26667 return DAG.getZExtOrTrunc(CSEL, DL, VT);
26668 }
26669
26670 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
26671 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
26672 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
26673 LHS->hasOneUse()) {
26674 EVT TstVT = LHS->getValueType(0);
26675 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 &&
26676 LHS->getConstantOperandVal(1) < TstVT.getFixedSizeInBits()) {
26677      // This pattern is optimized better by emitComparison.
26678 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
26679 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
26680 DAG.getSignedConstant(TstImm, DL, TstVT));
26681 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
26682 }
26683 }
26684
26685 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
26686 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
26687 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
26688 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
26689 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
26690 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
26691      (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
26692      LHS->getOpcode() == ISD::BITCAST) {
26693 EVT ToVT = LHS->getValueType(0);
26694 EVT FromVT = LHS->getOperand(0).getValueType();
26695 if (FromVT.isFixedLengthVector() &&
26696 FromVT.getVectorElementType() == MVT::i1) {
26697 bool IsNull = isNullConstant(RHS);
26698 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
26699 DL, MVT::i1, LHS->getOperand(0));
26700 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
26701 LHS);
26702 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
26703 }
26704 }
26705
26706 // Try to perform the memcmp when the result is tested for [in]equality with 0
26707 if (SDValue V = performOrXorChainCombine(N, DAG))
26708 return V;
26709
26710 EVT CmpVT = LHS.getValueType();
26711
26712 // NOTE: This exists as a combine only because it proved too awkward to match
26713 // splat(1) across all the NEON types during isel.
26714 APInt SplatLHSVal;
26715 if (CmpVT.isInteger() && Cond == ISD::SETGT &&
26716 ISD::isConstantSplatVector(LHS.getNode(), SplatLHSVal) &&
26717 SplatLHSVal.isOne())
26718 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, CmpVT), RHS, ISD::SETGE);
26719
26720 return SDValue();
26721}
26722
26723// Replace a flag-setting operator (eg ANDS) with the generic version
26724// (eg AND) if the flag is unused.
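// For example (illustrative): an (ANDS x, y) whose NZCV result has no users
// becomes (AND x, y); conversely, an existing (AND x, y) node is CSE'd into the
// value result of the ANDS.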
26725static SDValue performFlagSettingCombine(SDNode *N,
26726                                         TargetLowering::DAGCombinerInfo &DCI,
26727                                         unsigned GenericOpcode) {
26728 SDLoc DL(N);
26729 SDValue LHS = N->getOperand(0);
26730 SDValue RHS = N->getOperand(1);
26731 EVT VT = N->getValueType(0);
26732
26733 // If the flag result isn't used, convert back to a generic opcode.
26734 if (!N->hasAnyUseOfValue(1)) {
26735 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
26736 return DCI.CombineTo(N, Res, SDValue(N, 1));
26737 }
26738
26739 // Combine equivalent generic nodes into this node, re-using the result.
26740 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
26741 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS},
26742 /*AllowCommute=*/true))
26743 DCI.CombineTo(Generic, SDValue(N, 0));
26744
26745 return SDValue();
26746}
26747
26748static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
26749  // setcc_merge_zero pred
26750 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
26751 // => extract_subvector (inner setcc_merge_zero)
26752 SDValue Pred = N->getOperand(0);
26753 SDValue LHS = N->getOperand(1);
26754 SDValue RHS = N->getOperand(2);
26755 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26756
26757 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
26758 LHS->getOpcode() != ISD::SIGN_EXTEND)
26759 return SDValue();
26760
26761 SDValue Extract = LHS->getOperand(0);
26762 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26763 Extract->getValueType(0) != N->getValueType(0) ||
26764 Extract->getConstantOperandVal(1) != 0)
26765 return SDValue();
26766
26767 SDValue InnerSetCC = Extract->getOperand(0);
26768 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
26769 return SDValue();
26770
26771 // By this point we've effectively got
26772 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
26773 // lanes are already zero then the trunc(sext()) sequence is redundant and we
26774 // can operate on A directly.
26775 SDValue InnerPred = InnerSetCC.getOperand(0);
26776 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
26777 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
26778 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
26779 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
26780 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
26781 return Extract;
26782
26783 return SDValue();
26784}
26785
26786static bool isSignExtInReg(const SDValue &V) {
26787 if (V.getOpcode() != AArch64ISD::VASHR ||
26788 V.getOperand(0).getOpcode() != AArch64ISD::VSHL)
26789 return false;
26790
26791 unsigned BitWidth = V->getValueType(0).getScalarSizeInBits();
26792 unsigned ShiftAmtR = V.getConstantOperandVal(1);
26793 unsigned ShiftAmtL = V.getOperand(0).getConstantOperandVal(1);
26794 return (ShiftAmtR == ShiftAmtL && ShiftAmtR == (BitWidth - 1));
26795}
26796
26797static SDValue
26798performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26799  assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26800 "Unexpected opcode!");
26801
26802 SelectionDAG &DAG = DCI.DAG;
26803 SDValue Pred = N->getOperand(0);
26804 SDValue LHS = N->getOperand(1);
26805 SDValue RHS = N->getOperand(2);
26806 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
26807
26808 if (SDValue V = performSetCCPunpkCombine(N, DAG))
26809 return V;
26810
26811 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26812 LHS->getOpcode() == ISD::SIGN_EXTEND &&
26813 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
26814 // setcc_merge_zero(
26815 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
26816 // => setcc_merge_zero(pred, ...)
26817 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
26818 LHS->getOperand(0)->getOperand(0) == Pred)
26819 return LHS->getOperand(0);
26820
26821 // setcc_merge_zero(
26822 // all_active, extend(nxvNi1 ...), != splat(0))
26823 // -> nxvNi1 ...
26824 if (isAllActivePredicate(DAG, Pred))
26825 return LHS->getOperand(0);
26826
26827 // setcc_merge_zero(
26828 // pred, extend(nxvNi1 ...), != splat(0))
26829 // -> nxvNi1 and(pred, ...)
26830 if (DCI.isAfterLegalizeDAG())
26831 // Do this after legalization to allow more folds on setcc_merge_zero
26832 // to be recognized.
26833 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
26834 LHS->getOperand(0), Pred);
26835 }
26836
26837 // setcc_merge_zero(
26838 // pred, insert_subvector(undef, signext_inreg(vNi1), 0), != splat(0))
26839 // => setcc_merge_zero(
26840 // pred, insert_subvector(undef, shl(vNi1), 0), != splat(0))
26841 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
26842 LHS->getOpcode() == ISD::INSERT_SUBVECTOR && LHS.hasOneUse()) {
26843 SDValue L0 = LHS->getOperand(0);
26844 SDValue L1 = LHS->getOperand(1);
26845 SDValue L2 = LHS->getOperand(2);
26846
26847 if (L0.isUndef() && isNullConstant(L2) && isSignExtInReg(L1)) {
26848 SDLoc DL(N);
26849 SDValue Shl = L1.getOperand(0);
26850      SDValue NewLHS = DAG.getNode(ISD::INSERT_SUBVECTOR, DL,
26851                                   LHS.getValueType(), L0, Shl, L2);
26852 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, N->getValueType(0),
26853 Pred, NewLHS, RHS, N->getOperand(3));
26854 }
26855 }
26856
26857 return SDValue();
26858}
26859
26860// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
26861// as well as whether the test should be inverted. This code is required to
26862// catch these cases (as opposed to standard dag combines) because
26863// AArch64ISD::TBZ is matched during legalization.
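// For example (illustrative): (tbz (and (srl x, 3), 1), 0) folds to
// (tbz x, 3) by applying the AND and SRL rules below in sequence.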
26864static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
26865 SelectionDAG &DAG) {
26866
26867 if (!Op->hasOneUse())
26868 return Op;
26869
26870 // We don't handle undef/constant-fold cases below, as they should have
26871 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
26872 // etc.)
26873
26874 // (tbz (trunc x), b) -> (tbz x, b)
26875 // This case is just here to enable more of the below cases to be caught.
26876 if (Op->getOpcode() == ISD::TRUNCATE &&
26877 Bit < Op->getValueType(0).getSizeInBits()) {
26878 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26879 }
26880
26881 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
26882 if (Op->getOpcode() == ISD::ANY_EXTEND &&
26883 Bit < Op->getOperand(0).getValueSizeInBits()) {
26884 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26885 }
26886
26887 if (Op->getNumOperands() != 2)
26888 return Op;
26889
26890 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
26891 if (!C)
26892 return Op;
26893
26894 switch (Op->getOpcode()) {
26895 default:
26896 return Op;
26897
26898 // (tbz (and x, m), b) -> (tbz x, b)
26899 case ISD::AND:
26900 if ((C->getZExtValue() >> Bit) & 1)
26901 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26902 return Op;
26903
26904 // (tbz (shl x, c), b) -> (tbz x, b-c)
26905 case ISD::SHL:
26906 if (C->getZExtValue() <= Bit &&
26907 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26908 Bit = Bit - C->getZExtValue();
26909 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26910 }
26911 return Op;
26912
26913 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
26914 case ISD::SRA:
26915 Bit = Bit + C->getZExtValue();
26916 if (Bit >= Op->getValueType(0).getSizeInBits())
26917 Bit = Op->getValueType(0).getSizeInBits() - 1;
26918 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26919
26920 // (tbz (srl x, c), b) -> (tbz x, b+c)
26921 case ISD::SRL:
26922 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
26923 Bit = Bit + C->getZExtValue();
26924 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26925 }
26926 return Op;
26927
26928 // (tbz (xor x, -1), b) -> (tbnz x, b)
26929 case ISD::XOR:
26930 if ((C->getZExtValue() >> Bit) & 1)
26931 Invert = !Invert;
26932 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
26933 }
26934}
26935
26936// Optimize test single bit zero/non-zero and branch.
26937static SDValue performTBZCombine(SDNode *N,
26938                                 TargetLowering::DAGCombinerInfo &DCI,
26939                                 SelectionDAG &DAG) {
26940 unsigned Bit = N->getConstantOperandVal(2);
26941 bool Invert = false;
26942 SDValue TestSrc = N->getOperand(1);
26943 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
26944
26945 if (TestSrc == NewTestSrc)
26946 return SDValue();
26947
26948 unsigned NewOpc = N->getOpcode();
26949 if (Invert) {
26950 if (NewOpc == AArch64ISD::TBZ)
26951 NewOpc = AArch64ISD::TBNZ;
26952 else {
26953 assert(NewOpc == AArch64ISD::TBNZ);
26954 NewOpc = AArch64ISD::TBZ;
26955 }
26956 }
26957
26958 SDLoc DL(N);
26959 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
26960 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
26961}
26962
26963// Swap vselect operands where it may allow a predicated operation to achieve
26964// the `sel`.
26965//
26966// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
26967// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
26968static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
26969  auto SelectA = N->getOperand(1);
26970 auto SelectB = N->getOperand(2);
26971 auto NTy = N->getValueType(0);
26972
26973 if (!NTy.isScalableVector())
26974 return SDValue();
26975 SDValue SetCC = N->getOperand(0);
26976 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
26977 return SDValue();
26978
26979 switch (SelectB.getOpcode()) {
26980 default:
26981 return SDValue();
26982 case ISD::FMUL:
26983 case ISD::FSUB:
26984 case ISD::FADD:
26985 break;
26986 }
26987 if (SelectA != SelectB.getOperand(0))
26988 return SDValue();
26989
26990 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
26991 ISD::CondCode InverseCC =
26992      ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
26993  auto InverseSetCC =
26994 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
26995 SetCC.getOperand(1), InverseCC);
26996
26997 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
26998 {InverseSetCC, SelectB, SelectA});
26999}
27000
27001// vselect (v1i1 setcc) ->
27002// vselect (v1iXX setcc) (XX is the size of the compared operand type)
27003// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
27004// condition. If it can legalize "VSELECT v1i1" correctly, there is no need to
27005// combine such a VSELECT.
27006static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
27007  if (auto SwapResult = trySwapVSelectOperands(N, DAG))
27008 return SwapResult;
27009
27010 SDValue N0 = N->getOperand(0);
27011 SDValue IfTrue = N->getOperand(1);
27012 SDValue IfFalse = N->getOperand(2);
27013 EVT ResVT = N->getValueType(0);
27014 EVT CCVT = N0.getValueType();
27015
27016 if (isAllActivePredicate(DAG, N0))
27017 return N->getOperand(1);
27018
27019 if (isAllInactivePredicate(N0))
27020 return N->getOperand(2);
27021
27022 if (isMergePassthruOpcode(IfTrue.getOpcode()) && IfTrue.hasOneUse()) {
27023    // vselect A, (merge_passthru_op all_active, B,{Bn,} -), C
27024    // vselect A, (merge_passthru_op -, B,{Bn,} undef), C
27025    // vselect A, (merge_passthru_op A, B,{Bn,} -), C
27026    // -> merge_passthru_op A, B,{Bn,} C
27027 if (isAllActivePredicate(DAG, IfTrue->getOperand(0)) ||
27028 IfTrue->getOperand(IfTrue.getNumOperands() - 1).isUndef() ||
27029 IfTrue->getOperand(0) == N0) {
27031 Ops[0] = N0;
27032 Ops[IfTrue.getNumOperands() - 1] = IfFalse;
27033
27034 return DAG.getNode(IfTrue.getOpcode(), SDLoc(N), ResVT, Ops);
27035 }
27036 }
27037
27038 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
27039 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
27040 // supported types.
27041 SDValue SetCC = N->getOperand(0);
27042 if (SetCC.getOpcode() == ISD::SETCC &&
27043 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
27044 SDValue CmpLHS = SetCC.getOperand(0);
27045 EVT VT = CmpLHS.getValueType();
27046 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
27047 SDNode *SplatLHS = N->getOperand(1).getNode();
27048 SDNode *SplatRHS = N->getOperand(2).getNode();
27049 APInt SplatLHSVal;
27050 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
27051 VT.isSimple() &&
27052 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
27053 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
27054 VT.getSimpleVT().SimpleTy) &&
27055 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
27056 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
27058 unsigned NumElts = VT.getVectorNumElements();
27060 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
27061 VT.getScalarType()));
27062 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
27063
27064 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
27065 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
27066 return Or;
27067 }
27068 }
27069
27070 EVT CmpVT = N0.getOperand(0).getValueType();
27071 if (N0.getOpcode() != ISD::SETCC ||
27073 CCVT.getVectorElementType() != MVT::i1 ||
27075 return SDValue();
27076
27077 // Only combine when the result type is of the same size as the compared
27078 // operands.
27079 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
27080 return SDValue();
27081
27082 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
27083 N0.getOperand(0), N0.getOperand(1),
27084 cast<CondCodeSDNode>(N0.getOperand(2))->get());
27085 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
27086 IfTrue, IfFalse);
27087}
27088
27089/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
27090/// the compare-mask instructions rather than going via NZCV, even if LHS and
27091/// RHS are really scalar. This replaces any scalar setcc in the above pattern
27092/// with a vector one followed by a DUP shuffle on the result.
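/// For example (illustrative): (select (setcc i32 a, b, lt), v4i32 t, v4i32 f)
/// becomes a v4i32 compare of vectors holding a and b in lane 0, whose lane-0
/// result is duplicated across all lanes and used as the select mask.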
27093static SDValue performSelectCombine(SDNode *N,
27094                                    TargetLowering::DAGCombinerInfo &DCI) {
27095  SelectionDAG &DAG = DCI.DAG;
27096 SDValue N0 = N->getOperand(0);
27097 EVT ResVT = N->getValueType(0);
27098
27099 if (N0.getOpcode() != ISD::SETCC)
27100 return SDValue();
27101
27102 if (ResVT.isScalableVT())
27103 return SDValue();
27104
27105 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
27106 // scalar SetCCResultType. We also don't expect vectors, because we assume
27107 // that selects fed by vector SETCCs are canonicalized to VSELECT.
27108 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
27109 "Scalar-SETCC feeding SELECT has unexpected result type!");
27110
27111 // Don't try to do this optimization when the setcc itself has i1 operands.
27112 // There are no legal vectors of i1, so this would be pointless. v1f16 is
27113  // ruled out to prevent the creation of setcc nodes that need to be scalarized.
27114 EVT SrcVT = N0.getOperand(0).getValueType();
27115 if (SrcVT == MVT::i1 ||
27116 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
27117 return SDValue();
27118
27119  // If NumMaskElts == 0, the comparison is larger than the select result. The
27120  // largest real NEON comparison is 64 bits per lane, which means the result is
27121  // at most 32 bits and an illegal vector. Just bail out for now.
27122 unsigned NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
27123 if (!ResVT.isVector() || NumMaskElts == 0)
27124 return SDValue();
27125
27126 // Avoid creating vectors with excessive VFs before legalization.
27127 if (DCI.isBeforeLegalize() && NumMaskElts != ResVT.getVectorNumElements())
27128 return SDValue();
27129
27130 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
27131  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
27132
27133 // Also bail out if the vector CCVT isn't the same size as ResVT.
27134 // This can happen if the SETCC operand size doesn't divide the ResVT size
27135 // (e.g., f64 vs v3f32).
27136 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
27137 return SDValue();
27138
27139 // Make sure we didn't create illegal types, if we're not supposed to.
27140 assert(DCI.isBeforeLegalize() ||
27141 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
27142
27143 // First perform a vector comparison, where lane 0 is the one we're interested
27144 // in.
27145 SDLoc DL(N0);
27146 SDValue LHS =
27147 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
27148 SDValue RHS =
27149 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
27150 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
27151
27152 // Now duplicate the comparison mask we want across all other lanes.
27153 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
27154 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
27155 Mask = DAG.getNode(ISD::BITCAST, DL,
27156 ResVT.changeVectorElementTypeToInteger(), Mask);
27157
27158 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
27159}
27160
27161static SDValue performDUPCombine(SDNode *N,
27162                                 TargetLowering::DAGCombinerInfo &DCI) {
27163  EVT VT = N->getValueType(0);
27164 SDLoc DL(N);
27165 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
27166  // 128-bit vector version.
27167 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
27168    EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
27169    SmallVector<SDValue> Ops(N->ops());
27170 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
27171 DCI.DAG.getVTList(LVT), Ops)) {
27172 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
27173 DCI.DAG.getConstant(0, DL, MVT::i64));
27174 }
27175 }
27176
27177 if (N->getOpcode() == AArch64ISD::DUP) {
27178 SDValue Op = N->getOperand(0);
27179
27180 // Optimize DUP(extload/zextload i8/i16/i32) to avoid GPR->FPR transfer.
27181 // For example:
27182 // v4i32 = DUP (i32 (zextloadi8 addr))
27183 // =>
27184 // v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0
27185 // v4i32 = DUPLANE32 (v4i32), 0
27186 if (auto *LD = dyn_cast<LoadSDNode>(Op)) {
27187 ISD::LoadExtType ExtType = LD->getExtensionType();
27188 EVT MemVT = LD->getMemoryVT();
27189 EVT ElemVT = VT.getVectorElementType();
27190 if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
27191 (MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) &&
27192 ElemVT != MemVT && LD->hasOneUse()) {
27193 EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
27194 128 / ElemVT.getSizeInBits());
27195 SDValue ScalarToVec =
27196 DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op);
27197 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec,
27198 DCI.DAG.getConstant(0, DL, MVT::i64));
27199 }
27200 }
27201
27202 // If the instruction is known to produce a scalar in SIMD registers, we can
27203 // duplicate it across the vector lanes using DUPLANE instead of moving it
27204 // to a GPR first. For example, this allows us to handle:
27205 // v4i32 = DUP (i32 (FCMGT (f32, f32)))
27206 // FIXME: Ideally, we should be able to handle all instructions that
27207 // produce a scalar value in FPRs.
27208 if (Op.getOpcode() == AArch64ISD::FCMEQ ||
27209 Op.getOpcode() == AArch64ISD::FCMGE ||
27210 Op.getOpcode() == AArch64ISD::FCMGT) {
27211 EVT ElemVT = VT.getVectorElementType();
27212 EVT ExpandedVT = VT;
27213 // Insert into a 128-bit vector to match DUPLANE's pattern.
27214 if (VT.getSizeInBits() != 128)
27215 ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
27216 128 / ElemVT.getSizeInBits());
27217 SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
27218 SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
27219 DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
27220 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
27221 }
27222
27223 if (DCI.isAfterLegalizeDAG()) {
27224      // If the scalar DUP's operand is an extract_vector_elt, try to combine them
27225      // into a DUPLANE. For example,
27226 //
27227 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
27228 // t18: v4i32 = AArch64ISD::DUP t21
27229 // ==>
27230 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
27231 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
27232 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
27233 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
27234 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
27235 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
27236 EXTRACT_VEC_ELT.getOperand(1));
27237 }
27238 }
27239 }
27240
27241 return performPostLD1Combine(N, DCI, false);
27242 }
27243
27244 return SDValue();
27245}
27246
27247/// Get rid of unnecessary NVCASTs (that don't change the type).
27248static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
27249  if (N->getValueType(0) == N->getOperand(0).getValueType())
27250 return N->getOperand(0);
27251 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
27252 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
27253 N->getOperand(0).getOperand(0));
27254
27255 return SDValue();
27256}
27257
27258// If all users of the globaladdr are of the form (globaladdr + constant), find
27259// the smallest constant, fold it into the globaladdr's offset and rewrite the
27260// globaladdr as (globaladdr + constant) - constant.
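// For example (illustrative): if g is only used at g+8 and g+12, the node is
// rewritten as (globaladdr g, +8) - 8, so both users can share a single
// materialisation of g+8.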
27261static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
27262                                           const AArch64Subtarget *Subtarget,
27263 const TargetMachine &TM) {
27264 auto *GN = cast<GlobalAddressSDNode>(N);
27265 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
27266      AArch64II::MO_NO_FLAG)
27267    return SDValue();
27268
27269 uint64_t MinOffset = -1ull;
27270 for (SDNode *N : GN->users()) {
27271 if (N->getOpcode() != ISD::ADD)
27272 return SDValue();
27273 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
27274 if (!C)
27275 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
27276 if (!C)
27277 return SDValue();
27278 MinOffset = std::min(MinOffset, C->getZExtValue());
27279 }
27280 uint64_t Offset = MinOffset + GN->getOffset();
27281
27282 // Require that the new offset is larger than the existing one. Otherwise, we
27283 // can end up oscillating between two possible DAGs, for example,
27284 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
27285 if (Offset <= uint64_t(GN->getOffset()))
27286 return SDValue();
27287
27288 // Check whether folding this offset is legal. It must not go out of bounds of
27289 // the referenced object to avoid violating the code model, and must be
27290 // smaller than 2^20 because this is the largest offset expressible in all
27291 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
27292 // stores an immediate signed 21 bit offset.)
27293 //
27294 // This check also prevents us from folding negative offsets, which will end
27295 // up being treated in the same way as large positive ones. They could also
27296 // cause code model violations, and aren't really common enough to matter.
27297 if (Offset >= (1 << 20))
27298 return SDValue();
27299
27300 const GlobalValue *GV = GN->getGlobal();
27301 Type *T = GV->getValueType();
27302 if (!T->isSized() ||
27304 return SDValue();
27305
27306 SDLoc DL(GN);
27307 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
27308 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
27309 DAG.getConstant(MinOffset, DL, MVT::i64));
27310}
27311
27312static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
27313                                  const AArch64Subtarget *Subtarget) {
27314 SDValue BR = N->getOperand(0);
27315 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
27316      !BR.getValueType().isScalarInteger())
27317    return SDValue();
27318
27319 SDLoc DL(N);
27320 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
27321}
27322
27323// Turns the vector of indices into a vector of byte offsets by scaling Offset
27324// by (BitWidth / 8).
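// For example (illustrative): with 32-bit elements, an index vector
// <0, 1, 2, ...> becomes the byte-offset vector <0, 4, 8, ...> via a splatted
// left shift by log2(32 / 8) = 2.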
27325static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
27326                                          SDLoc DL, unsigned BitWidth) {
27327 assert(Offset.getValueType().isScalableVector() &&
27328 "This method is only for scalable vectors of offsets");
27329
27330 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
27331 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
27332
27333 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
27334}
27335
27336/// Check if the value of \p OffsetInBytes can be used as an immediate for
27337/// the gather load/prefetch and scatter store instructions with vector base and
27338/// immediate offset addressing mode:
27339///
27340/// [<Zn>.[S|D]{, #<imm>}]
27341///
27342/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
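/// For example (illustrative), with 4-byte elements the valid immediates are
/// 0, 4, 8, ..., 124.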
27343inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
27344 unsigned ScalarSizeInBytes) {
27345 // The immediate is not a multiple of the scalar size.
27346 if (OffsetInBytes % ScalarSizeInBytes)
27347 return false;
27348
27349 // The immediate is out of range.
27350 if (OffsetInBytes / ScalarSizeInBytes > 31)
27351 return false;
27352
27353 return true;
27354}
27355
27356/// Check if the value of \p Offset represents a valid immediate for the SVE
27357/// gather load/prefetch and scatter store instructions with vector base and
27358/// immediate offset addressing mode:
27359///
27360/// [<Zn>.[S|D]{, #<imm>}]
27361///
27362/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
27363static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
27364                                           unsigned ScalarSizeInBytes) {
27365 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
27366 return OffsetConst && isValidImmForSVEVecImmAddrMode(
27367 OffsetConst->getZExtValue(), ScalarSizeInBytes);
27368}
27369
27370static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
27371                                          unsigned Opcode,
27372 bool OnlyPackedOffsets = true) {
27373 const SDValue Src = N->getOperand(2);
27374 const EVT SrcVT = Src->getValueType(0);
27375 assert(SrcVT.isScalableVector() &&
27376 "Scatter stores are only possible for SVE vectors");
27377
27378 SDLoc DL(N);
27379 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
27380
27381 // Make sure that source data will fit into an SVE register
27382  if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
27383    return SDValue();
27384
27385 // For FPs, ACLE only supports _packed_ single and double precision types.
27386 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
27387 if (SrcElVT.isFloatingPoint())
27388 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
27389 ((Opcode != AArch64ISD::SST1Q_PRED &&
27390 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
27391 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
27392 return SDValue();
27393
27394 // Depending on the addressing mode, this is either a pointer or a vector of
27395 // pointers (that fits into one register)
27396 SDValue Base = N->getOperand(4);
27397 // Depending on the addressing mode, this is either a single offset or a
27398 // vector of offsets (that fits into one register)
27399 SDValue Offset = N->getOperand(5);
27400
27401 // For "scalar + vector of indices", just scale the indices. This only
27402 // applies to non-temporal scatters because there's no instruction that takes
27403 // indices.
27404 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
27405 Offset =
27406        getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
27407    Opcode = AArch64ISD::SSTNT1_PRED;
27408 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
27409 Offset =
27410        getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
27411    Opcode = AArch64ISD::SST1Q_PRED;
27412 }
27413
27414  // In the case of non-temporal scatter stores there's only one SVE instruction
27415  // per data-size: "vector + scalar", i.e.
27416 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
27417 // Since we do have intrinsics that allow the arguments to be in a different
27418 // order, we may need to swap them to match the spec.
27419 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
27420 Offset.getValueType().isVector())
27421    std::swap(Base, Offset);
27422
27423 // SST1_IMM requires that the offset is an immediate that is:
27424 // * a multiple of #SizeInBytes,
27425 // * in the range [0, 31 x #SizeInBytes],
27426 // where #SizeInBytes is the size in bytes of the stored items. For
27427 // immediates outside that range and non-immediate scalar offsets use SST1 or
27428 // SST1_UXTW instead.
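  // For example (illustrative): storing 4-byte elements with an immediate
  // offset of 128 (> 31 * 4) is out of range, so the node is rewritten to the
  // register-offset form below.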
27429 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
27430    if (!isValidImmForSVEVecImmAddrMode(Offset,
27431                                        SrcVT.getScalarSizeInBits() / 8)) {
27432 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
27433 Opcode = AArch64ISD::SST1_UXTW_PRED;
27434 else
27435 Opcode = AArch64ISD::SST1_PRED;
27436
27437      std::swap(Base, Offset);
27438    }
27439 }
27440
27441 auto &TLI = DAG.getTargetLoweringInfo();
27442 if (!TLI.isTypeLegal(Base.getValueType()))
27443 return SDValue();
27444
27445 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
27446  // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
27447 // nxv2i64. Legalize accordingly.
27448 if (!OnlyPackedOffsets &&
27449 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
27450 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
27451
27452 if (!TLI.isTypeLegal(Offset.getValueType()))
27453 return SDValue();
27454
27455 // Source value type that is representable in hardware
27456 EVT HwSrcVt = getSVEContainerType(SrcVT);
27457
27458 // Keep the original type of the input data to store - this is needed to be
27459 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
27460 // FP values we want the integer equivalent, so just use HwSrcVt.
27461 SDValue InputVT = DAG.getValueType(SrcVT);
27462 if (SrcVT.isFloatingPoint())
27463 InputVT = DAG.getValueType(HwSrcVt);
27464
27465 SDVTList VTs = DAG.getVTList(MVT::Other);
27466 SDValue SrcNew;
27467
27468 if (Src.getValueType().isFloatingPoint())
27469 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
27470 else
27471 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
27472
27473 SDValue Ops[] = {N->getOperand(0), // Chain
27474 SrcNew,
27475 N->getOperand(3), // Pg
27476 Base,
27477 Offset,
27478 InputVT};
27479
27480 return DAG.getNode(Opcode, DL, VTs, Ops);
27481}
27482
27483static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
27484                                        unsigned Opcode,
27485 bool OnlyPackedOffsets = true) {
27486 const EVT RetVT = N->getValueType(0);
27487 assert(RetVT.isScalableVector() &&
27488 "Gather loads are only possible for SVE vectors");
27489
27490 SDLoc DL(N);
27491
27492 // Make sure that the loaded data will fit into an SVE register
27493  if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
27494    return SDValue();
27495
27496 // Depending on the addressing mode, this is either a pointer or a vector of
27497 // pointers (that fits into one register)
27498 SDValue Base = N->getOperand(3);
27499 // Depending on the addressing mode, this is either a single offset or a
27500 // vector of offsets (that fits into one register)
27501 SDValue Offset = N->getOperand(4);
27502
27503 // For "scalar + vector of indices", scale the indices to obtain unscaled
27504 // offsets. This applies to non-temporal and quadword gathers, which do not
27505 // have an addressing mode with scaled offset.
27506 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
27507    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
27508                                        RetVT.getScalarSizeInBits());
27509 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
27510 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
27511    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
27512                                        RetVT.getScalarSizeInBits());
27513 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
27514 }
27515
27516 // In the case of non-temporal gather loads and quadword gather loads there's
27517  // only one addressing mode: "vector + scalar", e.g.
27518 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
27519 // Since we do have intrinsics that allow the arguments to be in a different
27520 // order, we may need to swap them to match the spec.
27521 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
27522 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
27523 Offset.getValueType().isVector())
27524    std::swap(Base, Offset);
27525
27526 // GLD{FF}1_IMM requires that the offset is an immediate that is:
27527 // * a multiple of #SizeInBytes,
27528 // * in the range [0, 31 x #SizeInBytes],
27529 // where #SizeInBytes is the size in bytes of the loaded items. For
27530 // immediates outside that range and non-immediate scalar offsets use
27531 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
27532 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
27533 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
27534    if (!isValidImmForSVEVecImmAddrMode(Offset,
27535                                        RetVT.getScalarSizeInBits() / 8)) {
27536 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
27537 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
27538 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
27539 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
27540 else
27541 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
27542 ? AArch64ISD::GLD1_MERGE_ZERO
27543 : AArch64ISD::GLDFF1_MERGE_ZERO;
27544
27545      std::swap(Base, Offset);
27546    }
27547 }
27548
27549 auto &TLI = DAG.getTargetLoweringInfo();
27550 if (!TLI.isTypeLegal(Base.getValueType()))
27551 return SDValue();
27552
27553 // Some gather load variants allow unpacked offsets, but only as nxv2i32
27554  // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
27555 // nxv2i64. Legalize accordingly.
27556 if (!OnlyPackedOffsets &&
27557 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
27558 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
27559
27560 // Return value type that is representable in hardware
27561 EVT HwRetVt = getSVEContainerType(RetVT);
27562
27563 // Keep the original output value type around - this is needed to be able to
27564 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
27565  // values we want the integer equivalent, so just use HwRetVt.
27566 SDValue OutVT = DAG.getValueType(RetVT);
27567 if (RetVT.isFloatingPoint())
27568 OutVT = DAG.getValueType(HwRetVt);
27569
27570 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
27571 SDValue Ops[] = {N->getOperand(0), // Chain
27572 N->getOperand(2), // Pg
27573 Base, Offset, OutVT};
27574
27575 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
27576 SDValue LoadChain = SDValue(Load.getNode(), 1);
27577
27578 if (RetVT.isInteger() && (RetVT != HwRetVt))
27579 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
27580
27581 // If the original return value was FP, bitcast accordingly. Doing it here
27582 // means that we can avoid adding TableGen patterns for FPs.
27583 if (RetVT.isFloatingPoint())
27584 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
27585
27586 return DAG.getMergeValues({Load, LoadChain}, DL);
27587}
27588
27589static SDValue
27590performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
27591                              SelectionDAG &DAG) {
27592 SDLoc DL(N);
27593 SDValue Src = N->getOperand(0);
27594 unsigned Opc = Src->getOpcode();
27595
27596 // Sign extend of an unsigned unpack -> signed unpack
27597 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
27598
27599 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
27600 : AArch64ISD::SUNPKLO;
27601
27602 // Push the sign extend to the operand of the unpack
27603 // This is necessary where, for example, the operand of the unpack
27604 // is another unpack:
27605 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
27606 // ->
27607 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
27608 // ->
27609 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
27610 SDValue ExtOp = Src->getOperand(0);
27611 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
27612 EVT EltTy = VT.getVectorElementType();
27613 (void)EltTy;
27614
27615 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
27616 "Sign extending from an invalid type");
27617
27618 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
27619
27620    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
27621                              ExtOp, DAG.getValueType(ExtVT));
27622
27623 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
27624 }
27625
27626 // Sign extend of CSET -> CSETM.
27627 if (Opc == AArch64ISD::CSEL &&
27628 cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) {
27629 EVT VT = N->getValueType(0);
27630 SDValue TVal = Src.getOperand(0);
27631 SDValue FVal = Src.getOperand(1);
27632
27633 // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV
27634 if (isNullConstant(TVal) && isOneConstant(FVal))
27635 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal,
27636 DAG.getAllOnesConstant(DL, VT), Src.getOperand(2),
27637 Src.getOperand(3));
27638
27639 // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV
27640 if (isOneConstant(TVal) && isNullConstant(FVal))
27641 return DAG.getNode(AArch64ISD::CSEL, DL, VT,
27642 DAG.getAllOnesConstant(DL, VT), FVal,
27643 Src.getOperand(2), Src.getOperand(3));
27644 }
27645
27646 if (DCI.isBeforeLegalizeOps())
27647 return SDValue();
27648
27649  if (!EnableCombineMGatherIntrinsics)
27650    return SDValue();
27651
27652 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
27653 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
27654 unsigned NewOpc;
27655 unsigned MemVTOpNum = 4;
27656 switch (Opc) {
27657 case AArch64ISD::LD1_MERGE_ZERO:
27658 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
27659 MemVTOpNum = 3;
27660 break;
27661 case AArch64ISD::LDNF1_MERGE_ZERO:
27662 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
27663 MemVTOpNum = 3;
27664 break;
27665 case AArch64ISD::LDFF1_MERGE_ZERO:
27666 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
27667 MemVTOpNum = 3;
27668 break;
27669 case AArch64ISD::GLD1_MERGE_ZERO:
27670 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
27671 break;
27672 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
27673 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
27674 break;
27675 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
27676 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
27677 break;
27678 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
27679 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
27680 break;
27681 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
27682 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
27683 break;
27684 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
27685 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
27686 break;
27687 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
27688 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
27689 break;
27690 case AArch64ISD::GLDFF1_MERGE_ZERO:
27691 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
27692 break;
27693 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
27694 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
27695 break;
27696 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
27697 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
27698 break;
27699 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
27700 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
27701 break;
27702 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
27703 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
27704 break;
27705 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
27706 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
27707 break;
27708 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
27709 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
27710 break;
27711 case AArch64ISD::GLDNT1_MERGE_ZERO:
27712 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
27713 break;
27714 default:
27715 return SDValue();
27716 }
27717
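// Note: the fold below only fires when the width being sign-extended from
// matches the load's memory type and the load has no other users, so the
// signed load variant is an exact replacement.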
27718 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
27719 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
27720
27721 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
27722 return SDValue();
27723
27724 EVT DstVT = N->getValueType(0);
27725 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
27726
27727 SmallVector<SDValue, 5> Ops;
27728 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
27729 Ops.push_back(Src->getOperand(I));
27730
27731 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
27732 DCI.CombineTo(N, ExtLoad);
27733 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
27734
27735 // Return N so it doesn't get rechecked
27736 return SDValue(N, 0);
27737}
27738
27739/// Legalize the gather prefetch (scalar + vector addressing mode) when the
27740/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
27741/// != nxv2i32) do not need legalization.
27742static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
27743 const unsigned OffsetPos = 4;
27744 SDValue Offset = N->getOperand(OffsetPos);
27745
27746 // Not an unpacked vector, bail out.
27747 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
27748 return SDValue();
27749
27750 // Extend the unpacked offset vector to 64-bit lanes.
27751 SDLoc DL(N);
27752 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
27753 SmallVector<SDValue, 5> Ops(N->ops());
27754 // Replace the offset operand with the 64-bit one.
27755 Ops[OffsetPos] = Offset;
27756
27757 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27758}
27759
27760/// Combines a node carrying the intrinsic
27761/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
27762/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
27763/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
27764/// sve gather prefetch instruction with vector plus immediate addressing mode.
27765static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
27766 unsigned ScalarSizeInBytes) {
27767 const unsigned ImmPos = 4, OffsetPos = 3;
27768 // No need to combine the node if the immediate is valid...
27769 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
27770 return SDValue();
27771
27772 // ...otherwise swap the offset base with the offset...
27773 SmallVector<SDValue, 5> Ops(N->ops());
27774 std::swap(Ops[ImmPos], Ops[OffsetPos]);
27775 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
27776 // `aarch64_sve_prfb_gather_uxtw_index`.
27777 SDLoc DL(N);
27778 Ops[1] = DAG.getTargetConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index,
27779 DL, MVT::i64);
27780
27781 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
27782}
27783
27784// Return true if the vector operation can guarantee only the first lane of its
27785// result contains data, with all bits in other lanes set to zero.
27786static bool isLanes1toNKnownZero(SDValue Op) {
27787 switch (Op.getOpcode()) {
27788 default:
27789 return false;
27790 case AArch64ISD::ANDV_PRED:
27791 case AArch64ISD::EORV_PRED:
27792 case AArch64ISD::FADDA_PRED:
27793 case AArch64ISD::FADDV_PRED:
27794 case AArch64ISD::FMAXNMV_PRED:
27795 case AArch64ISD::FMAXV_PRED:
27796 case AArch64ISD::FMINNMV_PRED:
27797 case AArch64ISD::FMINV_PRED:
27798 case AArch64ISD::ORV_PRED:
27799 case AArch64ISD::SADDV_PRED:
27800 case AArch64ISD::SMAXV_PRED:
27801 case AArch64ISD::SMINV_PRED:
27802 case AArch64ISD::UADDV_PRED:
27803 case AArch64ISD::UMAXV_PRED:
27804 case AArch64ISD::UMINV_PRED:
27805 return true;
27806 }
27807}
27808
27809// Return true if the vector operation can guarantee that the first lane of its
27810// result is active.
27811static bool isLane0KnownActive(SDValue Op) {
27812 switch (Op.getOpcode()) {
27813 default:
27814 return false;
27815 case AArch64ISD::REINTERPRET_CAST:
27816 return isLane0KnownActive(Op->getOperand(0));
27817 case ISD::SPLAT_VECTOR:
27818 return isOneConstant(Op.getOperand(0));
27819 case AArch64ISD::PTRUE:
27820 return Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all;
27821 }
27822}
27823
27824static SDValue removeRedundantInsertVectorElt(SDNode *N) {
27825 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
27826 SDValue InsertVec = N->getOperand(0);
27827 SDValue InsertElt = N->getOperand(1);
27828 SDValue InsertIdx = N->getOperand(2);
27829
27830 // We only care about inserts into the first element...
27831 if (!isNullConstant(InsertIdx))
27832 return SDValue();
27833 // ...of a zero'd vector...
27834 if (!isZerosVector(InsertVec.getNode()))
27835 return SDValue();
27836 // ...where the inserted data was previously extracted...
27837 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
27838 return SDValue();
27839
27840 SDValue ExtractVec = InsertElt.getOperand(0);
27841 SDValue ExtractIdx = InsertElt.getOperand(1);
27842
27843 // ...from the first element of a vector.
27844 if (!isNullConstant(ExtractIdx))
27845 return SDValue();
27846
27847 // If we get here we are effectively trying to zero lanes 1-N of a vector.
27848
27849 // Ensure there's no type conversion going on.
27850 if (N->getValueType(0) != ExtractVec.getValueType())
27851 return SDValue();
27852
27853 if (!isLanes1toNKnownZero(ExtractVec))
27854 return SDValue();
27855
27856 // The explicit zeroing is redundant.
27857 return ExtractVec;
27858}
27859
27860static SDValue
27861performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
27862 if (SDValue Res = removeRedundantInsertVectorElt(N))
27863 return Res;
27864
27865 return performPostLD1Combine(N, DCI, true);
27866}
27867
27868static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
27869 TargetLowering::DAGCombinerInfo &DCI,
27870 const AArch64Subtarget *Subtarget) {
27871 SDValue N0 = N->getOperand(0);
27872 EVT VT = N->getValueType(0);
27873
27874 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
27875 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
27876 return SDValue();
27877
27878 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
27879 EVT EltVT = VT.getVectorElementType();
27880 return EltVT == MVT::f32 || EltVT == MVT::f64;
27881 };
27882
27883 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
27884 // We purposefully don't care about legality of the nodes here as we know
27885 // they can be split down into something legal.
27886 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
27887 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
27888 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
27889 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
27890 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
27891 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
27892 LN0->getChain(), LN0->getBasePtr(),
27893 N0.getValueType(), LN0->getMemOperand());
27894 DCI.CombineTo(N, ExtLoad);
27895 DCI.CombineTo(
27896 N0.getNode(),
27897 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
27898 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
27899 ExtLoad.getValue(1));
27900 return SDValue(N, 0); // Return N so it doesn't get rechecked!
27901 }
27902
27903 return SDValue();
27904}
27905
27906static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
27907 const AArch64Subtarget *Subtarget) {
27908 EVT VT = N->getValueType(0);
27909
27910 // Don't expand for NEON, SVE2 or SME
27911 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
27912 return SDValue();
27913
27914 SDLoc DL(N);
27915
27916 SDValue Mask = N->getOperand(0);
27917 SDValue In1 = N->getOperand(1);
27918 SDValue In2 = N->getOperand(2);
27919
27920 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
27921 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
27922 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
27923 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
27924}
27925
27926static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
27927 EVT VT = N->getValueType(0);
27928
27929 SDValue Insert = N->getOperand(0);
27930 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
27931 return SDValue();
27932
27933 if (!Insert.getOperand(0).isUndef())
27934 return SDValue();
27935
27936 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
27937 uint64_t IdxDupLane = N->getConstantOperandVal(1);
27938 if (IdxInsert != 0 || IdxDupLane != 0)
27939 return SDValue();
27940
27941 SDValue Bitcast = Insert.getOperand(1);
27942 if (Bitcast.getOpcode() != ISD::BITCAST)
27943 return SDValue();
27944
27945 SDValue Subvec = Bitcast.getOperand(0);
27946 EVT SubvecVT = Subvec.getValueType();
27947 if (!SubvecVT.is128BitVector())
27948 return SDValue();
27949 EVT NewSubvecVT =
27950 getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
27951
27952 SDLoc DL(N);
27953 SDValue NewInsert =
27954 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
27955 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
27956 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
27957 NewInsert, N->getOperand(1));
27958 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
27959}
27960
27961// Try to combine mull with uzp1.
27962static SDValue tryCombineMULLWithUZP1(SDNode *N,
27963 TargetLowering::DAGCombinerInfo &DCI,
27964 SelectionDAG &DAG) {
27965 if (DCI.isBeforeLegalizeOps())
27966 return SDValue();
27967
27968 SDValue LHS = N->getOperand(0);
27969 SDValue RHS = N->getOperand(1);
27970
27971 SDValue ExtractHigh;
27972 SDValue ExtractLow;
27973 SDValue TruncHigh;
27974 SDValue TruncLow;
27975 SDLoc DL(N);
27976
27977 // Check the operands are trunc and extract_high.
27978 if (isEssentiallyExtractHighSubvector(LHS) &&
27979 RHS.getOpcode() == ISD::TRUNCATE) {
27980 TruncHigh = RHS;
27981 if (LHS.getOpcode() == ISD::BITCAST)
27982 ExtractHigh = LHS.getOperand(0);
27983 else
27984 ExtractHigh = LHS;
27985 } else if (isEssentiallyExtractHighSubvector(RHS) &&
27986 LHS.getOpcode() == ISD::TRUNCATE) {
27987 TruncHigh = LHS;
27988 if (RHS.getOpcode() == ISD::BITCAST)
27989 ExtractHigh = RHS.getOperand(0);
27990 else
27991 ExtractHigh = RHS;
27992 } else
27993 return SDValue();
27994
27995 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
27996 // with uzp1.
27997 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
27998 SDValue TruncHighOp = TruncHigh.getOperand(0);
27999 EVT TruncHighOpVT = TruncHighOp.getValueType();
28000 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
28001 DAG.isSplatValue(TruncHighOp, false))
28002 return SDValue();
28003
28004 // Check there is other extract_high with same source vector.
28005 // For example,
28006 //
28007 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
28008 // t12: v4i16 = truncate t11
28009 // t31: v4i32 = AArch64ISD::SMULL t18, t12
28010 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
28011 // t16: v4i16 = truncate t15
28012 // t30: v4i32 = AArch64ISD::SMULL t23, t1
28013 //
28014 // This dagcombine assumes the two extract_high nodes use the same source
28015 // vector in order to detect the pair of the mull. If they have different
28016 // source vectors, this code will not work.
28017 // TODO: Should also try to look through a bitcast.
28018 bool HasFoundMULLow = true;
28019 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
28020 if (ExtractHighSrcVec->use_size() != 2)
28021 HasFoundMULLow = false;
28022
28023 // Find ExtractLow.
28024 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
28025 if (User == ExtractHigh.getNode())
28026 continue;
28027
28028 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
28029 !isNullConstant(User->getOperand(1))) {
28030 HasFoundMULLow = false;
28031 break;
28032 }
28033
28034 ExtractLow.setNode(User);
28035 }
28036
28037 if (!ExtractLow || !ExtractLow->hasOneUse())
28038 HasFoundMULLow = false;
28039
28040 // Check ExtractLow's user.
28041 if (HasFoundMULLow) {
28042 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
28043 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
28044 HasFoundMULLow = false;
28045 } else {
28046 if (ExtractLowUser->getOperand(0) == ExtractLow) {
28047 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
28048 TruncLow = ExtractLowUser->getOperand(1);
28049 else
28050 HasFoundMULLow = false;
28051 } else {
28052 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
28053 TruncLow = ExtractLowUser->getOperand(0);
28054 else
28055 HasFoundMULLow = false;
28056 }
28057 }
28058 }
28059
28060 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
28061 // with uzp1.
28062 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
28063 EVT TruncHighVT = TruncHigh.getValueType();
28064 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
28065 SDValue TruncLowOp =
28066 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
28067 EVT TruncLowOpVT = TruncLowOp.getValueType();
28068 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
28069 DAG.isSplatValue(TruncLowOp, false)))
28070 return SDValue();
28071
28072 // Create uzp1, extract_high and extract_low.
28073 if (TruncHighOpVT != UZP1VT)
28074 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
28075 if (TruncLowOpVT != UZP1VT)
28076 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
28077
28078 SDValue UZP1 =
28079 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
28080 SDValue HighIdxCst =
28081 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
28082 SDValue NewTruncHigh =
28083 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
28084 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
28085
28086 if (HasFoundMULLow) {
28087 EVT TruncLowVT = TruncLow.getValueType();
28088 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
28089 UZP1, ExtractLow.getOperand(1));
28090 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
28091 }
28092
28093 return SDValue(N, 0);
28094}
28095
28096static SDValue performMULLCombine(SDNode *N,
28097 TargetLowering::DAGCombinerInfo &DCI,
28098 SelectionDAG &DAG) {
28099 if (SDValue Val =
28100 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
28101 return Val;
28102
28103 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
28104 return Val;
28105
28106 return SDValue();
28107}
28108
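// PTEST_FIRST only inspects the first active element. When the first lane of
// the governing predicate is known active and the tested predicate is a
// concatenation, it is enough to test the low part of that concatenation.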
28109static SDValue
28110performPTestFirstCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
28111 SelectionDAG &DAG) {
28112 if (DCI.isBeforeLegalize())
28113 return SDValue();
28114
28115 SDLoc DL(N);
28116 auto Mask = N->getOperand(0);
28117 auto Pred = N->getOperand(1);
28118
28119 if (!isLane0KnownActive(Mask))
28120 return SDValue();
28121
28122 if (Pred->getOpcode() == AArch64ISD::REINTERPRET_CAST)
28123 Pred = Pred->getOperand(0);
28124
28125 if (Pred->getOpcode() == ISD::CONCAT_VECTORS) {
28126 Pred = Pred->getOperand(0);
28127 Pred = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pred);
28128 return DAG.getNode(AArch64ISD::PTEST_FIRST, DL, N->getValueType(0), Mask,
28129 Pred);
28130 }
28131
28132 return SDValue();
28133}
28134
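// Combine scalar_to_vector: reuse an existing DUP of the same scalar after
// legalization, and rewrite zext(extract_vector_elt(UADDLV, 0)) into an
// extract_subvector + NVCAST so the value never leaves the vector registers.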
28135static SDValue
28136performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
28137 SelectionDAG &DAG) {
28138 SDLoc DL(N);
28139
28140 // If a DUP(Op0) already exists, reuse it for the scalar_to_vector.
28141 if (DCI.isAfterLegalizeDAG()) {
28142 if (SDNode *LN = DCI.DAG.getNodeIfExists(AArch64ISD::DUP, N->getVTList(),
28143 N->getOperand(0)))
28144 return SDValue(LN, 0);
28145 }
28146
28147 // Let's do below transform.
28148 //
28149 // t34: v4i32 = AArch64ISD::UADDLV t2
28150 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
28151 // t7: i64 = zero_extend t35
28152 // t20: v1i64 = scalar_to_vector t7
28153 // ==>
28154 // t34: v4i32 = AArch64ISD::UADDLV t2
28155 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
28156 // t40: v1i64 = AArch64ISD::NVCAST t39
28157 if (DCI.isBeforeLegalizeOps())
28158 return SDValue();
28159
28160 EVT VT = N->getValueType(0);
28161 if (VT != MVT::v1i64)
28162 return SDValue();
28163
28164 SDValue ZEXT = N->getOperand(0);
28165 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
28166 return SDValue();
28167
28168 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
28169 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
28170 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
28171 return SDValue();
28172
28173 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
28174 return SDValue();
28175
28176 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
28177 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
28178 UADDLV.getValueType() != MVT::v4i32 ||
28179 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
28180 return SDValue();
28181
28182 // Let's generate new sequence with AArch64ISD::NVCAST.
28183 SDValue EXTRACT_SUBVEC =
28184 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
28185 DAG.getConstant(0, DL, MVT::i64));
28186 SDValue NVCAST =
28187 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
28188
28189 return NVCAST;
28190}
28191
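// Turn a 2- or 4-way deinterleave of extract_subvectors of a masked load into
// an SVE structured load (ld2/ld4) when the mask itself is an interleave of a
// single narrower mask.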
28192static SDValue performVectorDeinterleaveCombine(
28193 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
28194 if (!DCI.isBeforeLegalize())
28195 return SDValue();
28196
28197 unsigned NumParts = N->getNumOperands();
28198 if (NumParts != 2 && NumParts != 4)
28199 return SDValue();
28200
28201 EVT SubVecTy = N->getValueType(0);
28202
28203 // At the moment we're unlikely to see a fixed-width vector deinterleave as
28204 // we usually generate shuffles instead.
28205 unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
28206 if (!SubVecTy.isScalableVector() ||
28207 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
28208 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
28209 return SDValue();
28210
28211 // Make sure each input operand is the correct extract_subvector of the same
28212 // wider vector.
28213 SDValue Op0 = N->getOperand(0);
28214 for (unsigned I = 0; I < NumParts; I++) {
28215 SDValue OpI = N->getOperand(I);
28216 if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
28217 OpI->getOperand(0) != Op0->getOperand(0))
28218 return SDValue();
28219 if (OpI->getConstantOperandVal(1) != (I * MinNumElements))
28220 return SDValue();
28221 }
28222
28223 // Normal loads are currently already handled by the InterleavedAccessPass so
28224 // we don't expect to see them here. Bail out if the masked load has an
28225 // unexpected number of uses, since we want to avoid a situation where we have
28226 // both deinterleaving loads and normal loads in the same block. Also, discard
28227 // masked loads that are extending, indexed, have an unexpected offset or have
28228 // an unsupported passthru value until we find a valid use case.
28229 auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
28230 if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
28231 !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) ||
28232 !MaskedLoad->getOffset().isUndef() ||
28233 (!MaskedLoad->getPassThru()->isUndef() &&
28234 !isZerosVector(MaskedLoad->getPassThru().getNode())))
28235 return SDValue();
28236
28237 // Now prove that the mask is an interleave of identical masks.
28238 SDLoc DL(N);
28239 SDValue NarrowMask =
28240 getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
28241 if (!NarrowMask)
28242 return SDValue();
28243
28244 const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
28245 : Intrinsic::aarch64_sve_ld4_sret;
28246 SDValue NewLdOps[] = {MaskedLoad->getChain(),
28247 DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
28248 MaskedLoad->getBasePtr()};
28249 SDValue Res;
28250 if (NumParts == 2)
28251 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
28252 {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
28253 else
28254 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
28255 {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
28256 NewLdOps);
28257
28258 // We can now generate a structured load!
28259 SmallVector<SDValue, 4> ResOps(NumParts);
28260 for (unsigned Idx = 0; Idx < NumParts; Idx++)
28261 ResOps[Idx] = SDValue(Res.getNode(), Idx);
28262
28263 // Replace uses of the original chain result with the new chain result.
28264 DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
28265 SDValue(Res.getNode(), NumParts));
28266 return DCI.CombineTo(N, ResOps, false);
28267}
28268
28269/// If the operand is a bitwise AND with a constant RHS, and the shift has a
28270/// constant RHS and is the only use, we can pull it out of the shift, i.e.
28271///
28272/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
28273///
28274/// We prefer this canonical form to match existing isel patterns.
28275static SDValue performSHLCombine(SDNode *N,
28276 TargetLowering::DAGCombinerInfo &DCI,
28277 SelectionDAG &DAG) {
28278 if (DCI.isBeforeLegalizeOps())
28279 return SDValue();
28280
28281 SDValue Op0 = N->getOperand(0);
28282 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
28283 return SDValue();
28284
28285 SDValue C1 = Op0->getOperand(1);
28286 SDValue C2 = N->getOperand(1);
28287 if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
28288 return SDValue();
28289
28290 // Might be folded into shifted op, do not lower.
28291 if (N->hasOneUse()) {
28292 unsigned UseOpc = N->user_begin()->getOpcode();
28293 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
28294 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
28295 return SDValue();
28296 }
28297
28298 SDLoc DL(N);
28299 EVT VT = N->getValueType(0);
28300
28301 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
28302 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
28303 // causing an infinite loop. The result may also be worse.
28304 SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
28305 if (!isa<ConstantSDNode>(NewRHS))
28306 return SDValue();
28307
28308 SDValue X = Op0->getOperand(0);
28309 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
28310 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
28311}
28312
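// Lower aarch64.rndr/aarch64.rndrrs: read the RNDR/RNDRRS system register via
// MRS and turn the NZCV result into an i1 success flag with CSINC.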
28313static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
28314 unsigned IntrinsicID = N->getConstantOperandVal(1);
28315 auto Register =
28316 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
28317 : AArch64SysReg::RNDRRS);
28318 SDLoc DL(N);
28319 SDValue A = DAG.getNode(
28320 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
28321 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
28322 SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
28323 DAG.getConstant(0, DL, MVT::i32),
28324 DAG.getConstant(0, DL, MVT::i32),
28325 getCondCode(DAG, AArch64CC::NE), A.getValue(1));
28326 return DAG.getMergeValues(
28327 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
28328}
28329
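// Replace ctpop(zext(bitcast(<N x i1> mask))) with a negated vector add
// reduction of the sign-extended mask, avoiding a scalar popcount.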
28330static SDValue performCTPOPCombine(SDNode *N,
28331 TargetLowering::DAGCombinerInfo &DCI,
28332 SelectionDAG &DAG) {
28333 using namespace llvm::SDPatternMatch;
28334 if (!DCI.isBeforeLegalize())
28335 return SDValue();
28336
28337 // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask))
28338 SDValue Mask;
28339 if (!sd_match(N->getOperand(0), m_ZExt(m_BitCast(m_Value(Mask)))))
28340 return SDValue();
28341
28342 EVT VT = N->getValueType(0);
28343 EVT MaskVT = Mask.getValueType();
28344
28345 if (VT.isVector() || !MaskVT.isFixedLengthVector() ||
28346 MaskVT.getVectorElementType() != MVT::i1)
28347 return SDValue();
28348
28349 EVT ReduceInVT =
28351
28352 SDLoc DL(N);
28353 // Sign extend to best fit ZeroOrNegativeOneBooleanContent.
28354 SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask);
28355 SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask);
28356 return DAG.getNegative(NegPopCount, DL, VT);
28357}
28358
28359SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
28360 DAGCombinerInfo &DCI) const {
28361 SelectionDAG &DAG = DCI.DAG;
28362 switch (N->getOpcode()) {
28363 default:
28364 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
28365 break;
28366 case ISD::VECTOR_DEINTERLEAVE:
28367 return performVectorDeinterleaveCombine(N, DCI, DAG);
28368 case ISD::VECREDUCE_AND:
28369 case ISD::VECREDUCE_OR:
28370 case ISD::VECREDUCE_XOR:
28371 return performVecReduceBitwiseCombine(N, DCI, DAG);
28372 case ISD::ADD:
28373 case ISD::SUB:
28374 return performAddSubCombine(N, DCI);
28375 case ISD::BUILD_VECTOR:
28376 return performBuildVectorCombine(N, DCI, DAG);
28377 case ISD::SMIN:
28378 return performSMINCombine(N, DAG);
28379 case ISD::TRUNCATE:
28380 return performTruncateCombine(N, DAG, DCI);
28381 case AArch64ISD::ANDS:
28382 return performFlagSettingCombine(N, DCI, ISD::AND);
28383 case AArch64ISD::ADC:
28384 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
28385 return R;
28386 return foldADCToCINC(N, DAG);
28387 case AArch64ISD::SBC:
28388 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
28389 case AArch64ISD::ADCS:
28390 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
28391 return R;
28392 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
28393 case AArch64ISD::SBCS:
28394 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
28395 return R;
28396 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
28397 case AArch64ISD::ADDS:
28398 return performFlagSettingCombine(N, DCI, ISD::ADD);
28399 case AArch64ISD::SUBS:
28400 return performFlagSettingCombine(N, DCI, ISD::SUB);
28401 case AArch64ISD::BICi: {
28402 APInt DemandedBits =
28403 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
28404 APInt DemandedElts =
28405 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
28406
28407 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
28408 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
28409 return SDValue();
28410
28411 break;
28412 }
28413 case ISD::XOR:
28414 return performXorCombine(N, DAG, DCI, Subtarget);
28415 case ISD::MUL:
28416 return performMulCombine(N, DAG, DCI, Subtarget);
28417 case ISD::SINT_TO_FP:
28418 case ISD::UINT_TO_FP:
28419 return performIntToFpCombine(N, DAG, DCI, Subtarget);
28420 case ISD::FP_TO_SINT:
28421 case ISD::FP_TO_UINT:
28422 case ISD::FP_TO_SINT_SAT:
28423 case ISD::FP_TO_UINT_SAT:
28424 return performFpToIntCombine(N, DAG, DCI, Subtarget);
28425 case ISD::OR:
28426 return performORCombine(N, DCI, Subtarget, *this);
28427 case ISD::AND:
28428 return performANDCombine(N, DCI);
28429 case ISD::FADD:
28430 return performFADDCombine(N, DCI);
28431 case ISD::INTRINSIC_WO_CHAIN:
28432 return performIntrinsicCombine(N, DCI, Subtarget);
28433 case ISD::ANY_EXTEND:
28434 case ISD::ZERO_EXTEND:
28435 case ISD::SIGN_EXTEND:
28436 return performExtendCombine(N, DCI, DAG);
28437 case ISD::SIGN_EXTEND_INREG:
28438 return performSignExtendInRegCombine(N, DCI, DAG);
28439 case ISD::CONCAT_VECTORS:
28440 return performConcatVectorsCombine(N, DCI, DAG);
28441 case ISD::EXTRACT_SUBVECTOR:
28442 return performExtractSubvectorCombine(N, DCI, DAG);
28443 case ISD::INSERT_SUBVECTOR:
28444 return performInsertSubvectorCombine(N, DCI, DAG);
28445 case ISD::SELECT:
28446 return performSelectCombine(N, DCI);
28447 case ISD::VSELECT:
28448 return performVSelectCombine(N, DCI.DAG);
28449 case ISD::SETCC:
28450 return performSETCCCombine(N, DCI, DAG);
28451 case ISD::LOAD:
28452 return performLOADCombine(N, DCI, DAG, Subtarget);
28453 case ISD::STORE:
28454 return performSTORECombine(N, DCI, DAG, Subtarget);
28455 case ISD::MSTORE:
28456 return performMSTORECombine(N, DCI, DAG, Subtarget);
28457 case ISD::MGATHER:
28458 case ISD::MSCATTER:
28459 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
28460 return performMaskedGatherScatterCombine(N, DCI, DAG);
28461 case ISD::FP_EXTEND:
28462 return performFPExtendCombine(N, DAG, DCI, Subtarget);
28463 case AArch64ISD::BRCOND:
28464 return performBRCONDCombine(N, DCI, DAG);
28465 case AArch64ISD::TBNZ:
28466 case AArch64ISD::TBZ:
28467 return performTBZCombine(N, DCI, DAG);
28468 case AArch64ISD::CSEL:
28469 return performCSELCombine(N, DCI, DAG);
28470 case AArch64ISD::DUP:
28471 case AArch64ISD::DUPLANE8:
28472 case AArch64ISD::DUPLANE16:
28473 case AArch64ISD::DUPLANE32:
28474 case AArch64ISD::DUPLANE64:
28475 return performDUPCombine(N, DCI);
28476 case AArch64ISD::DUPLANE128:
28477 return performDupLane128Combine(N, DAG);
28478 case AArch64ISD::NVCAST:
28479 return performNVCASTCombine(N, DAG);
28480 case AArch64ISD::SPLICE:
28481 return performSpliceCombine(N, DAG);
28482 case AArch64ISD::UUNPKLO:
28483 case AArch64ISD::UUNPKHI:
28484 return performUnpackCombine(N, DAG, Subtarget);
28485 case AArch64ISD::UZP1:
28486 case AArch64ISD::UZP2:
28487 return performUzpCombine(N, DAG, Subtarget);
28488 case AArch64ISD::SETCC_MERGE_ZERO:
28489 return performSetccMergeZeroCombine(N, DCI);
28490 case AArch64ISD::REINTERPRET_CAST:
28491 return performReinterpretCastCombine(N, DAG);
28492 case AArch64ISD::GLD1_MERGE_ZERO:
28493 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
28494 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
28495 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
28496 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
28497 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
28498 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
28499 case AArch64ISD::GLD1S_MERGE_ZERO:
28500 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
28501 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
28502 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
28503 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
28504 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
28505 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
28506 return performGLD1Combine(N, DAG);
28507 case AArch64ISD::VASHR:
28508 case AArch64ISD::VLSHR:
28509 return performVectorShiftCombine(N, *this, DCI);
28510 case AArch64ISD::SUNPKLO:
28511 return performSunpkloCombine(N, DAG);
28512 case AArch64ISD::BSP:
28513 return performBSPExpandForSVE(N, DAG, Subtarget);
28514 case ISD::INSERT_VECTOR_ELT:
28515 return performInsertVectorEltCombine(N, DCI);
28516 case ISD::EXTRACT_VECTOR_ELT:
28517 return performExtractVectorEltCombine(N, DCI, Subtarget);
28518 case ISD::VECREDUCE_ADD:
28519 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
28520 case ISD::GET_ACTIVE_LANE_MASK:
28521 return performActiveLaneMaskCombine(N, DCI, Subtarget);
28522 case AArch64ISD::UADDV:
28523 return performUADDVCombine(N, DAG);
28524 case AArch64ISD::SMULL:
28525 case AArch64ISD::UMULL:
28526 case AArch64ISD::PMULL:
28527 return performMULLCombine(N, DCI, DAG);
28528 case AArch64ISD::PTEST_FIRST:
28529 return performPTestFirstCombine(N, DCI, DAG);
28530 case ISD::INTRINSIC_VOID:
28531 case ISD::INTRINSIC_W_CHAIN:
28532 switch (N->getConstantOperandVal(1)) {
28533 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
28534 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
28535 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
28536 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
28537 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
28538 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
28539 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
28540 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
28541 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
28542 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
28543 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
28544 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
28545 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
28546 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
28547 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
28548 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
28549 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
28550 case Intrinsic::aarch64_neon_ld2:
28551 case Intrinsic::aarch64_neon_ld3:
28552 case Intrinsic::aarch64_neon_ld4:
28553 case Intrinsic::aarch64_neon_ld1x2:
28554 case Intrinsic::aarch64_neon_ld1x3:
28555 case Intrinsic::aarch64_neon_ld1x4:
28556 case Intrinsic::aarch64_neon_ld2lane:
28557 case Intrinsic::aarch64_neon_ld3lane:
28558 case Intrinsic::aarch64_neon_ld4lane:
28559 case Intrinsic::aarch64_neon_ld2r:
28560 case Intrinsic::aarch64_neon_ld3r:
28561 case Intrinsic::aarch64_neon_ld4r:
28562 case Intrinsic::aarch64_neon_st2:
28563 case Intrinsic::aarch64_neon_st3:
28564 case Intrinsic::aarch64_neon_st4:
28565 case Intrinsic::aarch64_neon_st1x2:
28566 case Intrinsic::aarch64_neon_st1x3:
28567 case Intrinsic::aarch64_neon_st1x4:
28568 case Intrinsic::aarch64_neon_st2lane:
28569 case Intrinsic::aarch64_neon_st3lane:
28570 case Intrinsic::aarch64_neon_st4lane:
28571 return performNEONPostLDSTCombine(N, DCI, DAG);
28572 case Intrinsic::aarch64_sve_ldnt1:
28573 return performLDNT1Combine(N, DAG);
28574 case Intrinsic::aarch64_sve_ld1rq:
28575 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
28576 case Intrinsic::aarch64_sve_ld1ro:
28577 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
28578 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
28579 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
28580 case Intrinsic::aarch64_sve_ldnt1_gather:
28581 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
28582 case Intrinsic::aarch64_sve_ldnt1_gather_index:
28583 return performGatherLoadCombine(N, DAG,
28584 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
28585 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
28586 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
28587 case Intrinsic::aarch64_sve_ld1:
28588 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
28589 case Intrinsic::aarch64_sve_ldnf1:
28590 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
28591 case Intrinsic::aarch64_sve_ldff1:
28592 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
28593 case Intrinsic::aarch64_sve_st1:
28594 return performST1Combine(N, DAG);
28595 case Intrinsic::aarch64_sve_stnt1:
28596 return performSTNT1Combine(N, DAG);
28597 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
28598 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
28599 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
28600 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
28601 case Intrinsic::aarch64_sve_stnt1_scatter:
28602 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
28603 case Intrinsic::aarch64_sve_stnt1_scatter_index:
28604 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
28605 case Intrinsic::aarch64_sve_ld1_gather:
28606 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
28607 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
28608 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
28609 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
28610 case Intrinsic::aarch64_sve_ld1q_gather_index:
28611 return performGatherLoadCombine(N, DAG,
28612 AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
28613 case Intrinsic::aarch64_sve_ld1_gather_index:
28614 return performGatherLoadCombine(N, DAG,
28615 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
28616 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
28617 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
28618 /*OnlyPackedOffsets=*/false);
28619 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
28620 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
28621 /*OnlyPackedOffsets=*/false);
28622 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
28623 return performGatherLoadCombine(N, DAG,
28624 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
28625 /*OnlyPackedOffsets=*/false);
28626 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
28627 return performGatherLoadCombine(N, DAG,
28628 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
28629 /*OnlyPackedOffsets=*/false);
28630 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
28631 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
28632 case Intrinsic::aarch64_sve_ldff1_gather:
28633 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
28634 case Intrinsic::aarch64_sve_ldff1_gather_index:
28635 return performGatherLoadCombine(N, DAG,
28636 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
28637 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
28638 return performGatherLoadCombine(N, DAG,
28639 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
28640 /*OnlyPackedOffsets=*/false);
28641 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
28642 return performGatherLoadCombine(N, DAG,
28643 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
28644 /*OnlyPackedOffsets=*/false);
28645 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
28646 return performGatherLoadCombine(N, DAG,
28647 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
28648 /*OnlyPackedOffsets=*/false);
28649 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
28650 return performGatherLoadCombine(N, DAG,
28651 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
28652 /*OnlyPackedOffsets=*/false);
28653 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
28654 return performGatherLoadCombine(N, DAG,
28655 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
28656 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
28657 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
28658 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
28659 case Intrinsic::aarch64_sve_st1q_scatter_index:
28660 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
28661 case Intrinsic::aarch64_sve_st1_scatter:
28662 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
28663 case Intrinsic::aarch64_sve_st1_scatter_index:
28664 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
28665 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
28666 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
28667 /*OnlyPackedOffsets=*/false);
28668 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
28669 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
28670 /*OnlyPackedOffsets=*/false);
28671 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
28672 return performScatterStoreCombine(N, DAG,
28673 AArch64ISD::SST1_SXTW_SCALED_PRED,
28674 /*OnlyPackedOffsets=*/false);
28675 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
28676 return performScatterStoreCombine(N, DAG,
28677 AArch64ISD::SST1_UXTW_SCALED_PRED,
28678 /*OnlyPackedOffsets=*/false);
28679 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
28680 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
28681 case Intrinsic::aarch64_rndr:
28682 case Intrinsic::aarch64_rndrrs:
28683 return performRNDRCombine(N, DAG);
28684 case Intrinsic::aarch64_sme_ldr_zt:
28685 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
28686 DAG.getVTList(MVT::Other), N->getOperand(0),
28687 N->getOperand(2), N->getOperand(3));
28688 case Intrinsic::aarch64_sme_str_zt:
28689 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
28690 DAG.getVTList(MVT::Other), N->getOperand(0),
28691 N->getOperand(2), N->getOperand(3));
28692 default:
28693 break;
28694 }
28695 break;
28696 case ISD::GlobalAddress:
28697 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
28698 case ISD::CTLZ:
28699 return performCTLZCombine(N, DAG, Subtarget);
28700 case ISD::SCALAR_TO_VECTOR:
28701 return performScalarToVectorCombine(N, DCI, DAG);
28702 case ISD::SHL:
28703 return performSHLCombine(N, DCI, DAG);
28704 case ISD::CTPOP:
28705 return performCTPOPCombine(N, DCI, DAG);
28706 }
28707 return SDValue();
28708}
28709
28710// Check if the return value is used only as a return value, as otherwise
28711// we can't perform a tail-call. In particular, we need to check for
28712// target ISD nodes that are returns and any other "odd" constructs
28713// that the generic analysis code won't necessarily catch.
28714bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
28715 SDValue &Chain) const {
28716 if (N->getNumValues() != 1)
28717 return false;
28718 if (!N->hasNUsesOfValue(1, 0))
28719 return false;
28720
28721 SDValue TCChain = Chain;
28722 SDNode *Copy = *N->user_begin();
28723 if (Copy->getOpcode() == ISD::CopyToReg) {
28724 // If the copy has a glue operand, we conservatively assume it isn't safe to
28725 // perform a tail call.
28726 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
28727 MVT::Glue)
28728 return false;
28729 TCChain = Copy->getOperand(0);
28730 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
28731 return false;
28732
28733 bool HasRet = false;
28734 for (SDNode *Node : Copy->users()) {
28735 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
28736 return false;
28737 HasRet = true;
28738 }
28739
28740 if (!HasRet)
28741 return false;
28742
28743 Chain = TCChain;
28744 return true;
28745}
28746
28747// Return whether an instruction can potentially be optimized to a tail
28748// call. This will cause the optimizers to attempt to move, or duplicate,
28749// return instructions to help enable tail call optimizations for this
28750// instruction.
28751bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
28752 return CI->isTailCall();
28753}
28754
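// GlobalISel hook: pre/post-indexed forms are only worthwhile for a non-zero
// constant offset that fits the signed 9-bit immediate of the indexed
// load/store instructions.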
28755bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
28756 Register Offset, bool IsPre,
28757 MachineRegisterInfo &MRI) const {
28758 auto CstOffset = getIConstantVRegVal(Offset, MRI);
28759 if (!CstOffset || CstOffset->isZero())
28760 return false;
28761
28762 // All of the indexed addressing mode instructions take a signed 9 bit
28763 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
28764 // encodes the sign/indexing direction.
28765 return isInt<9>(CstOffset->getSExtValue());
28766}
28767
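// Shared helper for pre/post-indexed selection: accept an ADD/SUB of the
// pointer by a signed 9-bit constant, but prefer a replicating load (ld1r*)
// when the loaded value only feeds a scalable-vector splat.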
28768bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
28769 SDValue &Base,
28770 SDValue &Offset,
28771 SelectionDAG &DAG) const {
28772 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
28773 return false;
28774
28775 // Non-null if there is exactly one user of the loaded value (ignoring chain).
28776 SDNode *ValOnlyUser = nullptr;
28777 for (SDUse &U : N->uses()) {
28778 if (U.getResNo() == 1)
28779 continue; // Ignore chain.
28780 if (ValOnlyUser == nullptr)
28781 ValOnlyUser = U.getUser();
28782 else {
28783 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
28784 break;
28785 }
28786 }
28787
28788 auto IsUndefOrZero = [](SDValue V) {
28789 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
28790 };
28791
28792 // If the only user of the value is a scalable vector splat, it is
28793 // preferable to do a replicating load (ld1r*).
28794 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
28795 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
28796 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
28797 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
28798 return false;
28799
28800 Base = Op->getOperand(0);
28801 // All of the indexed addressing mode instructions take a signed
28802 // 9 bit immediate offset.
28803 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
28804 int64_t RHSC = RHS->getSExtValue();
28805 if (Op->getOpcode() == ISD::SUB)
28806 RHSC = -(uint64_t)RHSC;
28807 if (!isInt<9>(RHSC))
28808 return false;
28809 // For big-endian targets, VLD1/VST1 are used for vector load and store,
28810 // and these only allow an offset that's equal to the store size.
28811 EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
28812 if (!Subtarget->isLittleEndian() && MemType.isVector() &&
28813 (uint64_t)RHSC != MemType.getStoreSize())
28814 return false;
28815 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
28816 // when dealing with subtraction.
28817 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
28818 return true;
28819 }
28820 return false;
28821}
28822
28823bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
28824 SDValue &Offset,
28825 ISD::MemIndexedMode &AM,
28826 SelectionDAG &DAG) const {
28827 EVT VT;
28828 SDValue Ptr;
28829 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
28830 VT = LD->getMemoryVT();
28831 Ptr = LD->getBasePtr();
28832 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
28833 VT = ST->getMemoryVT();
28834 Ptr = ST->getBasePtr();
28835 } else
28836 return false;
28837
28838 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
28839 return false;
28840 AM = ISD::PRE_INC;
28841 return true;
28842}
28843
28844bool AArch64TargetLowering::getPostIndexedAddressParts(
28845 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
28846 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
28847 EVT VT;
28848 SDValue Ptr;
28849 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
28850 VT = LD->getMemoryVT();
28851 Ptr = LD->getBasePtr();
28852 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
28853 VT = ST->getMemoryVT();
28854 Ptr = ST->getBasePtr();
28855 } else
28856 return false;
28857
28858 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
28859 return false;
28860 // Post-indexing updates the base, so it's not a valid transform
28861 // if that's not the same as the load's pointer.
28862 if (Ptr != Base)
28863 return false;
28864 AM = ISD::POST_INC;
28865 return true;
28866}
28867
28868static void replaceBoolVectorBitcast(SDNode *N,
28869 SmallVectorImpl<SDValue> &Results,
28870 SelectionDAG &DAG) {
28871 SDLoc DL(N);
28872 SDValue Op = N->getOperand(0);
28873 EVT VT = N->getValueType(0);
28874 [[maybe_unused]] EVT SrcVT = Op.getValueType();
28875 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
28876 "Must be bool vector.");
28877
28878 // Special handling for Clang's __builtin_convertvector. For vectors with <8
28879 // elements, it adds a vector concatenation with undef(s). If we encounter
28880 // this here, we can skip the concat.
28881 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
28882 bool AllUndef = true;
28883 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
28884 AllUndef &= Op.getOperand(I).isUndef();
28885
28886 if (AllUndef)
28887 Op = Op.getOperand(0);
28888 }
28889
28890 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
28891 if (VectorBits)
28892 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
28893}
28894
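// Expand a bitcast from a scalar to a non-legal small vector type by widening:
// place the scalar in lane zero of ExtendVT, bitcast to CastVT, then extract
// the low subvector of the result.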
28895static void CustomNonLegalBITCASTResults(SDNode *N,
28896 SmallVectorImpl<SDValue> &Results,
28897 SelectionDAG &DAG, EVT ExtendVT,
28898 EVT CastVT) {
28899 SDLoc DL(N);
28900 SDValue Op = N->getOperand(0);
28901 EVT VT = N->getValueType(0);
28902
28903 // Use SCALAR_TO_VECTOR for lane zero
28904 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
28905 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
28906 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
28907 Results.push_back(
28908 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
28909}
28910
28911void AArch64TargetLowering::ReplaceBITCASTResults(
28912 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
28913 SDLoc DL(N);
28914 SDValue Op = N->getOperand(0);
28915 EVT VT = N->getValueType(0);
28916 EVT SrcVT = Op.getValueType();
28917
28918 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
28919 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
28920 return;
28921 }
28922
28923 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
28924 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
28925 return;
28926 }
28927
28928 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
28929 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
28930 return;
28931 }
28932
28933 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
28934 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
28935 "Expected fp->int bitcast!");
28936
28937 // Bitcasting between unpacked vector types of different element counts is
28938 // not a NOP because the live elements are laid out differently.
28939 // 01234567
28940 // e.g. nxv2i32 = XX??XX??
28941 // nxv4f16 = X?X?X?X?
28942 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
28943 return;
28944
28945 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
28946 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
28947 return;
28948 }
28949
28950 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
28951 !VT.isVector())
28952 return replaceBoolVectorBitcast(N, Results, DAG);
28953
28954 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
28955 return;
28956
28957 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
28958 DAG.getUNDEF(MVT::i32), Op);
28959 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
28960 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
28961}
28962
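// Recognise (f)add of a 256-bit vector with the <1,0,3,2,...> shuffle of
// itself and rewrite it as ADDP on the two halves, then broadcast each
// pairwise sum back to both lanes of its pair.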
28963static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
28964 SelectionDAG &DAG,
28965 const AArch64Subtarget *Subtarget) {
28966 EVT VT = N->getValueType(0);
28967 if (!VT.is256BitVector() ||
28968 (VT.getScalarType().isFloatingPoint() &&
28969 !N->getFlags().hasAllowReassociation()) ||
28970 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
28971 VT.getScalarType() == MVT::bf16)
28972 return;
28973
28974 SDValue X = N->getOperand(0);
28975 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
28976 if (!Shuf) {
28977 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
28978 X = N->getOperand(1);
28979 if (!Shuf)
28980 return;
28981 }
28982
28983 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
28984 return;
28985
28986 // Check the mask is 1,0,3,2,5,4,...
28987 ArrayRef<int> Mask = Shuf->getMask();
28988 for (int I = 0, E = Mask.size(); I < E; I++)
28989 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
28990 return;
28991
28992 SDLoc DL(N);
28993 auto LoHi = DAG.SplitVector(X, DL);
28994 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
28995 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
28996 LoHi.first, LoHi.second);
28997
28998 // Shuffle the elements back into order.
28999 SmallVector<int> NMask;
29000 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
29001 NMask.push_back(I);
29002 NMask.push_back(I);
29003 }
29004 Results.push_back(
29005 DAG.getVectorShuffle(VT, DL,
29006 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
29007 DAG.getUNDEF(LoHi.first.getValueType())),
29008 DAG.getUNDEF(VT), NMask));
29009}
29010
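// Legalize a reduction over an illegally wide vector: split the input, combine
// the halves with InterOp, then reduce the narrower vector with AcrossOp.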
29011static void ReplaceReductionResults(SDNode *N,
29012 SmallVectorImpl<SDValue> &Results,
29013 SelectionDAG &DAG, unsigned InterOp,
29014 unsigned AcrossOp) {
29015 EVT LoVT, HiVT;
29016 SDValue Lo, Hi;
29017 SDLoc DL(N);
29018 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
29019 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
29020 SDValue InterVal = DAG.getNode(InterOp, DL, LoVT, Lo, Hi);
29021 SDValue SplitVal = DAG.getNode(AcrossOp, DL, LoVT, InterVal);
29022 Results.push_back(SplitVal);
29023}
29024
29025void AArch64TargetLowering::ReplaceExtractSubVectorResults(
29026 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
29027 SDValue In = N->getOperand(0);
29028 EVT InVT = In.getValueType();
29029
29030 // Common code will handle these just fine.
29031 if (!InVT.isScalableVector() || !InVT.isInteger())
29032 return;
29033
29034 SDLoc DL(N);
29035 EVT VT = N->getValueType(0);
29036
29037 // The following checks bail if this is not a halving operation.
29038
29039 ElementCount ResEC = VT.getVectorElementCount();
29040
29041 if (InVT.getVectorElementCount() != (ResEC * 2))
29042 return;
29043
29044 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
29045 if (!CIndex)
29046 return;
29047
29048 unsigned Index = CIndex->getZExtValue();
29049 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
29050 return;
29051
29052 unsigned Opcode = (Index == 0) ? (unsigned)ISD::ANY_EXTEND_VECTOR_INREG
29053 : (unsigned)AArch64ISD::UUNPKHI;
29054 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
29055
29056 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
29057 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
29058}
29059
29060void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
29061 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
29062 assert((Subtarget->hasSVE2p1() ||
29063 (Subtarget->hasSME2() && Subtarget->isStreaming())) &&
29064 "Custom lower of get.active.lane.mask missing required feature.");
29065
29066 assert(N->getValueType(0) == MVT::nxv32i1 &&
29067 "Unexpected result type for get.active.lane.mask");
29068
29069 SDLoc DL(N);
29070 SDValue Idx = N->getOperand(0);
29071 SDValue TC = N->getOperand(1);
29072
29073 assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
29074 "Unexpected operand type for get.active.lane.mask");
29075
29076 if (Idx.getValueType() != MVT::i64) {
29077 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
29078 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
29079 }
29080
29081 SDValue ID =
29082 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
29083 EVT HalfVT = N->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
29084 auto WideMask =
29085 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {HalfVT, HalfVT}, {ID, Idx, TC});
29086
29087 Results.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
29088 {WideMask.getValue(0), WideMask.getValue(1)}));
29089}
29090
29091// Create an even/odd pair of X registers holding integer value V.
29092static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
29093 SDLoc DL(V.getNode());
29094 auto [VLo, VHi] = DAG.SplitScalar(V, DL, MVT::i64, MVT::i64);
29095 if (DAG.getDataLayout().isBigEndian())
29096 std::swap(VLo, VHi);
29097 SDValue RegClass =
29098 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
29099 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
29100 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);
29101 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
29102 return SDValue(
29103 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops), 0);
29104}
29105
29106static void ReplaceCMP_SWAP_128Results(SDNode *N,
29107 SmallVectorImpl<SDValue> &Results,
29108 SelectionDAG &DAG,
29109 const AArch64Subtarget *Subtarget) {
29110 assert(N->getValueType(0) == MVT::i128 &&
29111 "AtomicCmpSwap on types less than 128 should be legal");
29112
29113 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
29114 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
29115 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
29116 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
29117 SDValue Ops[] = {
29118 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
29119 createGPRPairNode(DAG, N->getOperand(3)), // Store value
29120 N->getOperand(1), // Ptr
29121 N->getOperand(0), // Chain in
29122 };
29123
29124 unsigned Opcode;
29125 switch (MemOp->getMergedOrdering()) {
29126 case AtomicOrdering::Monotonic:
29127 Opcode = AArch64::CASPX;
29128 break;
29129 case AtomicOrdering::Acquire:
29130 Opcode = AArch64::CASPAX;
29131 break;
29132 case AtomicOrdering::Release:
29133 Opcode = AArch64::CASPLX;
29134 break;
29135 case AtomicOrdering::AcquireRelease:
29136 case AtomicOrdering::SequentiallyConsistent:
29137 Opcode = AArch64::CASPALX;
29138 break;
29139 default:
29140 llvm_unreachable("Unexpected ordering!");
29141 }
29142
29143 MachineSDNode *CmpSwap = DAG.getMachineNode(
29144 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
29145 DAG.setNodeMemRefs(CmpSwap, {MemOp});
29146
29147 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
29148 if (DAG.getDataLayout().isBigEndian())
29149 std::swap(SubReg1, SubReg2);
29150 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
29151 SDValue(CmpSwap, 0));
29152 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
29153 SDValue(CmpSwap, 0));
29154 Results.push_back(
29155 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
29156 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
29157 return;
29158 }
29159
29160 unsigned Opcode;
29161 switch (MemOp->getMergedOrdering()) {
29162 case AtomicOrdering::Monotonic:
29163 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
29164 break;
29165 case AtomicOrdering::Acquire:
29166 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
29167 break;
29168 case AtomicOrdering::Release:
29169 Opcode = AArch64::CMP_SWAP_128_RELEASE;
29170 break;
29171 case AtomicOrdering::AcquireRelease:
29172 case AtomicOrdering::SequentiallyConsistent:
29173 Opcode = AArch64::CMP_SWAP_128;
29174 break;
29175 default:
29176 llvm_unreachable("Unexpected ordering!");
29177 }
29178
29179 SDLoc DL(N);
29180 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
29181 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
29182 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
29183 New.first, New.second, N->getOperand(0)};
29184 SDNode *CmpSwap = DAG.getMachineNode(
29185 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
29186 Ops);
29187 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
29188
29189 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
29190 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
29191 Results.push_back(SDValue(CmpSwap, 3));
29192}
29193
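// Map a 128-bit atomic RMW opcode and its memory ordering to the matching
// LSE128 instruction: LDCLRP for AND (operand inverted by the caller),
// LDSETP for OR, SWPP for swap, each with A/L/AL ordering variants.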
29194static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
29195 AtomicOrdering Ordering) {
29196 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
29197 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
29198 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
29199 // ATOMIC_LOAD_CLR at any point.
29200 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
29201 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
29202 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
29203 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
29204
29205 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
29206 // The operand will need to be XORed in a separate step.
29207 switch (Ordering) {
29208 case AtomicOrdering::Monotonic:
29209 return AArch64::LDCLRP;
29210 break;
29211 case AtomicOrdering::Acquire:
29212 return AArch64::LDCLRPA;
29213 break;
29214 case AtomicOrdering::Release:
29215 return AArch64::LDCLRPL;
29216 break;
29217 case AtomicOrdering::AcquireRelease:
29218 case AtomicOrdering::SequentiallyConsistent:
29219 return AArch64::LDCLRPAL;
29220 break;
29221 default:
29222 llvm_unreachable("Unexpected ordering!");
29223 }
29224 }
29225
29226 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
29227 switch (Ordering) {
29228 case AtomicOrdering::Monotonic:
29229 return AArch64::LDSETP;
29230 break;
29231 case AtomicOrdering::Acquire:
29232 return AArch64::LDSETPA;
29233 break;
29234 case AtomicOrdering::Release:
29235 return AArch64::LDSETPL;
29236 break;
29237 case AtomicOrdering::AcquireRelease:
29238 case AtomicOrdering::SequentiallyConsistent:
29239 return AArch64::LDSETPAL;
29240 break;
29241 default:
29242 llvm_unreachable("Unexpected ordering!");
29243 }
29244 }
29245
29246 if (ISDOpcode == ISD::ATOMIC_SWAP) {
29247 switch (Ordering) {
29248 case AtomicOrdering::Monotonic:
29249 return AArch64::SWPP;
29250 break;
29251 case AtomicOrdering::Acquire:
29252 return AArch64::SWPPA;
29253 break;
29254 case AtomicOrdering::Release:
29255 return AArch64::SWPPL;
29256 break;
29257 case AtomicOrdering::AcquireRelease:
29258 case AtomicOrdering::SequentiallyConsistent:
29259 return AArch64::SWPPAL;
29260 break;
29261 default:
29262 llvm_unreachable("Unexpected ordering!");
29263 }
29264 }
29265
29266 llvm_unreachable("Unexpected ISDOpcode!");
29267}
29268
29269static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
29270 SmallVectorImpl<SDValue> &Results,
29271 SelectionDAG &DAG,
29272 const AArch64Subtarget *Subtarget) {
29273 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower them
29274 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
29275 // rather than the CASP instructions, because CASP has register classes for
29276 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
29277 // to present them as single operands. LSE128 instructions use the GPR64
29278 // register class (because the pair does not have to be sequential), like
29279 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
29280
29281 assert(N->getValueType(0) == MVT::i128 &&
29282 "AtomicLoadXXX on types less than 128 should be legal");
29283
29284 if (!Subtarget->hasLSE128())
29285 return;
29286
29287 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
29288 const SDValue &Chain = N->getOperand(0);
29289 const SDValue &Ptr = N->getOperand(1);
29290 const SDValue &Val128 = N->getOperand(2);
29291 std::pair<SDValue, SDValue> Val2x64 =
29292 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
29293
29294 const unsigned ISDOpcode = N->getOpcode();
29295 const unsigned MachineOpcode =
29296 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
29297
29298 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
29299 SDLoc DL(Val128);
29300 Val2x64.first =
29301 DAG.getNode(ISD::XOR, DL, MVT::i64,
29302 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.first);
29303 Val2x64.second =
29304 DAG.getNode(ISD::XOR, DL, MVT::i64,
29305 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.second);
29306 }
29307
29308 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
29309 if (DAG.getDataLayout().isBigEndian())
29310 std::swap(Ops[0], Ops[1]);
29311
29312 MachineSDNode *AtomicInst =
29313 DAG.getMachineNode(MachineOpcode, SDLoc(N),
29314 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
29315
29316 DAG.setNodeMemRefs(AtomicInst, {MemOp});
29317
29318 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
29319 if (DAG.getDataLayout().isBigEndian())
29320 std::swap(Lo, Hi);
29321
29322 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
29323 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
29324}
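// Illustrative expectation: a little-endian "atomicrmw xchg ptr %p, i128 %v seq_cst"
// takes this path with LSE128 and selects to SWPPAL; the two i64 results are then
// glued back into an i128 with BUILD_PAIR.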
29325
29326void AArch64TargetLowering::ReplaceNodeResults(
29327 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
29328 switch (N->getOpcode()) {
29329 default:
29330 llvm_unreachable("Don't know how to custom expand this");
29331 case ISD::BITCAST:
29332 ReplaceBITCASTResults(N, Results, DAG);
29333 return;
29334 case ISD::VECREDUCE_ADD:
29335 case ISD::VECREDUCE_SMAX:
29336 case ISD::VECREDUCE_SMIN:
29337 case ISD::VECREDUCE_UMAX:
29338 case ISD::VECREDUCE_UMIN:
29339 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
29340 return;
29341 case ISD::VECTOR_COMPRESS:
29342 if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
29343 Results.push_back(Res);
29344 return;
29345 case ISD::ADD:
29346 case ISD::FADD:
29347 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
29348 return;
29349
29350 case ISD::CTPOP:
29351 case ISD::PARITY:
29352 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
29353 Results.push_back(Result);
29354 return;
29355 case AArch64ISD::SADDV:
29356 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
29357 return;
29358 case AArch64ISD::UADDV:
29359 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
29360 return;
29361 case AArch64ISD::SMINV:
29362 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
29363 return;
29364 case AArch64ISD::UMINV:
29365 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
29366 return;
29367 case AArch64ISD::SMAXV:
29368 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
29369 return;
29370 case AArch64ISD::UMAXV:
29371 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
29372 return;
29373 case ISD::MULHS:
29374 if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
29375 Results.push_back(
29376 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
29377 return;
29378 case ISD::MULHU:
29379 if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
29380 Results.push_back(
29381 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
29382 return;
29383 case ISD::FP_TO_UINT:
29384 case ISD::FP_TO_SINT:
29385 case ISD::STRICT_FP_TO_SINT:
29386 case ISD::STRICT_FP_TO_UINT:
29387 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
29388 // Let normal code take care of it by not adding anything to Results.
29389 return;
29390 case ISD::ATOMIC_CMP_SWAP:
29391 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
29392 return;
29393 case ISD::ATOMIC_LOAD_CLR:
29394 assert(N->getValueType(0) != MVT::i128 &&
29395 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
29396 break;
29397 case ISD::ATOMIC_LOAD_AND:
29398 case ISD::ATOMIC_LOAD_OR:
29399 case ISD::ATOMIC_SWAP: {
29400 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
29401 "Expected 128-bit atomicrmw.");
29402 // These need custom type legalisation so we go directly to instruction.
29403 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
29404 return;
29405 }
29406 case ISD::ADDRSPACECAST: {
29407 SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
29408 Results.push_back(V);
29409 return;
29410 }
29411 case ISD::ATOMIC_LOAD:
29412 case ISD::LOAD: {
29413 MemSDNode *LoadNode = cast<MemSDNode>(N);
29414 EVT MemVT = LoadNode->getMemoryVT();
29415 // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
29416 // targets.
29417 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
29418 MemVT.getSizeInBits() == 256u &&
29419 (MemVT.getScalarSizeInBits() == 8u ||
29420 MemVT.getScalarSizeInBits() == 16u ||
29421 MemVT.getScalarSizeInBits() == 32u ||
29422 MemVT.getScalarSizeInBits() == 64u)) {
29423
29424 EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
29425 SDValue Result = DAG.getMemIntrinsicNode(
29426 AArch64ISD::LDNP, SDLoc(N),
29427 DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
29428 {LoadNode->getChain(), LoadNode->getBasePtr()},
29429 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
29430
29431 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
29432 DAG.getBitcast(HalfVT, Result.getValue(0)),
29433 DAG.getBitcast(HalfVT, Result.getValue(1)));
29434 Results.append({Pair, Result.getValue(2) /* Chain */});
29435 return;
29436 }
29437
29438 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
29439 LoadNode->getMemoryVT() != MVT::i128) {
29440 // Non-volatile, non-atomic loads are optimized later in AArch64's
29441 // load/store optimizer.
29442 return;
29443 }
29444
29445 if (SDValue(N, 0).getValueType() == MVT::i128) {
29446 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
29447 bool isLoadAcquire =
29448 AN && AN->getOrdering() == AtomicOrdering::Acquire;
29449 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
29450
29451 if (isLoadAcquire)
29452 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
29453
29454 SDValue Result = DAG.getMemIntrinsicNode(
29455 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
29456 {LoadNode->getChain(), LoadNode->getBasePtr()},
29457 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
29458
29459 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
29460
29461 SDValue Pair =
29462 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
29463 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
29464 Results.append({Pair, Result.getValue(2) /* Chain */});
29465 }
29466 return;
29467 }
29468 case ISD::EXTRACT_SUBVECTOR:
29469 ReplaceExtractSubVectorResults(N, Results, DAG);
29470 return;
29471 case ISD::INSERT_SUBVECTOR:
29472 case ISD::CONCAT_VECTORS:
29473 // Custom lowering has been requested for INSERT_SUBVECTOR and
29474 // CONCAT_VECTORS -- but delegate to common code for result type
29475 // legalisation
29476 return;
29477 case ISD::GET_ACTIVE_LANE_MASK:
29478 ReplaceGetActiveLaneMaskResults(N, Results, DAG);
29479 return;
29480 case ISD::INTRINSIC_WO_CHAIN: {
29481 EVT VT = N->getValueType(0);
29482
29483 Intrinsic::ID IntID =
29484 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
29485 switch (IntID) {
29486 default:
29487 return;
29488 case Intrinsic::aarch64_sve_clasta_n: {
29489 assert((VT == MVT::i8 || VT == MVT::i16) &&
29490 "custom lowering for unexpected type");
29491 SDLoc DL(N);
29492 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
29493 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
29494 N->getOperand(1), Op2, N->getOperand(3));
29495 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
29496 return;
29497 }
29498 case Intrinsic::aarch64_sve_clastb_n: {
29499 assert((VT == MVT::i8 || VT == MVT::i16) &&
29500 "custom lowering for unexpected type");
29501 SDLoc DL(N);
29502 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
29503 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
29504 N->getOperand(1), Op2, N->getOperand(3));
29505 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
29506 return;
29507 }
29508 case Intrinsic::aarch64_sve_lasta: {
29509 assert((VT == MVT::i8 || VT == MVT::i16) &&
29510 "custom lowering for unexpected type");
29511 SDLoc DL(N);
29512 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
29513 N->getOperand(1), N->getOperand(2));
29514 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
29515 return;
29516 }
29517 case Intrinsic::aarch64_sve_lastb: {
29518 assert((VT == MVT::i8 || VT == MVT::i16) &&
29519 "custom lowering for unexpected type");
29520 SDLoc DL(N);
29521 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
29522 N->getOperand(1), N->getOperand(2));
29523 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
29524 return;
29525 }
29526 case Intrinsic::aarch64_sme_in_streaming_mode: {
29527 SDLoc DL(N);
29528 SDValue Chain = DAG.getEntryNode();
29529
29530 SDValue RuntimePStateSM =
29531 getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
29532 Results.push_back(
29533 DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
29534 return;
29535 }
29536 case Intrinsic::experimental_vector_match: {
29537 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
29538 return;
29539
29540 // NOTE: Only trivial type promotion is supported.
29541 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
29542 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
29543 return;
29544
29545 SDLoc DL(N);
29546 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
29547 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
29548 return;
29549 }
29550 }
29551 }
29552 case ISD::READ_REGISTER: {
29553 SDLoc DL(N);
29554 assert(N->getValueType(0) == MVT::i128 &&
29555 "READ_REGISTER custom lowering is only for 128-bit sysregs");
29556 SDValue Chain = N->getOperand(0);
29557 SDValue SysRegName = N->getOperand(1);
29558
29559 SDValue Result = DAG.getNode(
29560 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
29561 Chain, SysRegName);
29562
29563 // Sysregs are not endian. Result.getValue(0) always contains the lower half
29564 // of the 128-bit System Register value.
29565 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
29566 Result.getValue(0), Result.getValue(1));
29567 Results.push_back(Pair);
29568 Results.push_back(Result.getValue(2)); // Chain
29569 return;
29570 }
29571 }
29572}
29573
29574bool AArch64TargetLowering::useLoadStackGuardNode(const Module &M) const {
29575 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
29576 return false;
29577 return true;
29578}
29579
29580unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
29581 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
29582 // reciprocal if there are three or more FDIVs.
29583 return 3;
29584}
29585
29586TargetLoweringBase::LegalizeTypeAction
29587AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
29588 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
29589 // v4i16, v2i32 instead of to promote.
29590 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
29591 VT == MVT::v1f32)
29592 return TypeWidenVector;
29593
29594 return TargetLoweringBase::getPreferredVectorAction(VT);
29595}
29596
29597// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
29598// provided the address is 16-byte aligned.
29599bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
29600 if (!Subtarget->hasLSE2())
29601 return false;
29602
29603 if (auto LI = dyn_cast<LoadInst>(I))
29604 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
29605 LI->getAlign() >= Align(16);
29606
29607 if (auto SI = dyn_cast<StoreInst>(I))
29608 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29609 SI->getAlign() >= Align(16);
29610
29611 return false;
29612}
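// For example, with LSE2 a "load atomic i128, ptr %p monotonic, align 16" satisfies
// this hook and can be emitted as a single LDP, while the same load with only
// "align 8" does not and is expanded instead.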
29613
29614bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
29615 if (!Subtarget->hasLSE128())
29616 return false;
29617
29618 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
29619 // will clobber the two registers.
29620 if (const auto *SI = dyn_cast<StoreInst>(I))
29621 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29622 SI->getAlign() >= Align(16) &&
29623 (SI->getOrdering() == AtomicOrdering::Release ||
29624 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
29625
29626 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
29627 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29628 RMW->getAlign() >= Align(16) &&
29629 (RMW->getOperation() == AtomicRMWInst::Xchg ||
29630 RMW->getOperation() == AtomicRMWInst::And ||
29631 RMW->getOperation() == AtomicRMWInst::Or);
29632
29633 return false;
29634}
29635
29636bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
29637 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
29638 return false;
29639
29640 if (auto LI = dyn_cast<LoadInst>(I))
29641 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
29642 LI->getAlign() >= Align(16) &&
29643 LI->getOrdering() == AtomicOrdering::Acquire;
29644
29645 if (auto SI = dyn_cast<StoreInst>(I))
29646 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
29647 SI->getAlign() >= Align(16) &&
29648 SI->getOrdering() == AtomicOrdering::Release;
29649
29650 return false;
29651}
29652
29653bool AArch64TargetLowering::shouldInsertFencesForAtomic(
29654 const Instruction *I) const {
29655 if (isOpSuitableForRCPC3(I))
29656 return false;
29657 if (isOpSuitableForLSE128(I))
29658 return false;
29659 if (isOpSuitableForLDPSTP(I))
29660 return true;
29661 return false;
29662}
29663
29664bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
29665 const Instruction *I) const {
29666 // Store-Release instructions only provide seq_cst guarantees when paired with
29667 // Load-Acquire instructions. MSVC CRT does not use these instructions to
29668 // implement seq_cst loads and stores, so we need additional explicit fences
29669 // after memory writes.
29670 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
29671 return false;
29672
29673 switch (I->getOpcode()) {
29674 default:
29675 return false;
29676 case Instruction::AtomicCmpXchg:
29677 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
29678 AtomicOrdering::SequentiallyConsistent;
29679 case Instruction::AtomicRMW:
29680 return cast<AtomicRMWInst>(I)->getOrdering() ==
29681 AtomicOrdering::SequentiallyConsistent;
29682 case Instruction::Store:
29683 return cast<StoreInst>(I)->getOrdering() ==
29684 AtomicOrdering::SequentiallyConsistent;
29685 }
29686}
29687
29688// Loads and stores less than 128-bits are already atomic; ones above that
29689// are doomed anyway, so defer to the default libcall and blame the OS when
29690// things go wrong.
29691TargetLowering::AtomicExpansionKind
29692AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
29693 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
29694 if (Size != 128)
29695 return AtomicExpansionKind::None;
29696 if (isOpSuitableForRCPC3(SI))
29697 return AtomicExpansionKind::None;
29698 if (isOpSuitableForLSE128(SI))
29699 return AtomicExpansionKind::Expand;
29700 if (isOpSuitableForLDPSTP(SI))
29701 return AtomicExpansionKind::None;
29702 return AtomicExpansionKind::Expand;
29703}
29704
29705// Loads and stores less than 128-bits are already atomic; ones above that
29706// are doomed anyway, so defer to the default libcall and blame the OS when
29707// things go wrong.
29708TargetLowering::AtomicExpansionKind
29709AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
29710 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
29711
29712 if (Size != 128)
29713 return AtomicExpansionKind::None;
29714 if (isOpSuitableForRCPC3(LI))
29715 return AtomicExpansionKind::None;
29716 // No LSE128 loads
29717 if (isOpSuitableForLDPSTP(LI))
29718 return AtomicExpansionKind::None;
29719
29720 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29721 // implement atomicrmw without spilling. If the target address is also on the
29722 // stack and close enough to the spill slot, this can lead to a situation
29723 // where the monitor always gets cleared and the atomic operation can never
29724 // succeed. So at -O0 lower this operation to a CAS loop.
29725 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
29726 return AtomicExpansionKind::CmpXChg;
29727
29728 // Using CAS for an atomic load has a better chance of succeeding under high
29729 // contention situations. So use it if available.
29730 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
29731 : AtomicExpansionKind::LLSC;
29732}
29733
29734// Return true if the atomic operation expansion will lower to use a library
29735// call, and is thus ineligible to use an LLSC expansion.
29736static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
29737 const AtomicRMWInst *RMW) {
29738 if (!RMW->isFloatingPointOperation())
29739 return false;
29740 switch (RMW->getType()->getScalarType()->getTypeID()) {
29741 case Type::FloatTyID:
29742 case Type::DoubleTyID:
29743 case Type::HalfTyID:
29744 case Type::BFloatTyID:
29745 // Will use soft float
29746 return !Subtarget.hasFPARMv8();
29747 default:
29748 // fp128 will emit library calls.
29749 return true;
29750 }
29751
29752 llvm_unreachable("covered type switch");
29753}
29754
29755// The "default" for integer RMW operations is to expand to an LL/SC loop.
29756// However, with the LSE instructions (or outline-atomics mode, which provides
29757// library routines in place of the LSE-instructions), we can directly emit many
29758// operations instead.
29759TargetLowering::AtomicExpansionKind
29760AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
29761 Type *Ty = AI->getType();
29762 unsigned Size = Ty->getPrimitiveSizeInBits();
29763 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
29764
29765 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
29766 (AI->getOperation() == AtomicRMWInst::Xchg ||
29767 AI->getOperation() == AtomicRMWInst::Or ||
29768 AI->getOperation() == AtomicRMWInst::And);
29769 if (CanUseLSE128)
29770 return AtomicExpansionKind::None;
29771
29772 // If LSFE available, use atomic FP instructions in preference to expansion
29773 if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
29774 AI->getOperation() == AtomicRMWInst::FMax ||
29775 AI->getOperation() == AtomicRMWInst::FMin ||
29776 AI->getOperation() == AtomicRMWInst::FMaximum ||
29777 AI->getOperation() == AtomicRMWInst::FMinimum))
29778 return AtomicExpansionKind::None;
29779
29780 // Leave 128 bits to LLSC or CmpXChg.
29781 if (Size < 128 && !AI->isFloatingPointOperation()) {
29782 if (Subtarget->hasLSE()) {
29783 // Nand is not supported in LSE.
29784 switch (AI->getOperation()) {
29785 case AtomicRMWInst::Xchg:
29786 case AtomicRMWInst::Add:
29787 case AtomicRMWInst::Sub:
29788 case AtomicRMWInst::And:
29789 case AtomicRMWInst::Or:
29790 case AtomicRMWInst::Xor:
29791 case AtomicRMWInst::Max:
29792 case AtomicRMWInst::Min:
29793 case AtomicRMWInst::UMax:
29794 case AtomicRMWInst::UMin:
29795 return AtomicExpansionKind::None;
29796 default:
29797 break;
29798 }
29799 }
29800 if (Subtarget->outlineAtomics()) {
29801 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
29802 // Don't outline them unless
29803 // (1) high level <atomic> support approved:
29804 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
29805 // (2) low level libgcc and compiler-rt support implemented by:
29806 // min/max outline atomics helpers
29807 switch (AI->getOperation()) {
29808 case AtomicRMWInst::Xchg:
29809 case AtomicRMWInst::Add:
29810 case AtomicRMWInst::Sub:
29811 case AtomicRMWInst::And:
29812 case AtomicRMWInst::Or:
29813 case AtomicRMWInst::Xor:
29814 return AtomicExpansionKind::None;
29815 default:
29816 break;
29817 }
29818 }
29819 }
29820
29821 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29822 // implement atomicrmw without spilling. If the target address is also on the
29823 // stack and close enough to the spill slot, this can lead to a situation
29824 // where the monitor always gets cleared and the atomic operation can never
29825 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
29826 // we have a single CAS instruction that can replace the loop.
29827 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
29828 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
29829 return AtomicExpansionKind::CmpXChg;
29830
29831 return AtomicExpansionKind::LLSC;
29832}
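// For instance, with +lse an i32 "atomicrmw add" is left intact here
// (AtomicExpansionKind::None) and later selects to LDADD, whereas without LSE or
// outline-atomics it is expanded to an LL/SC loop (or a CAS loop at -O0).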
29833
29834TargetLowering::AtomicExpansionKind
29835AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
29836 AtomicCmpXchgInst *AI) const {
29837 // If subtarget has LSE, leave cmpxchg intact for codegen.
29838 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
29839 return AtomicExpansionKind::None;
29840 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
29841 // implement cmpxchg without spilling. If the address being exchanged is also
29842 // on the stack and close enough to the spill slot, this can lead to a
29843 // situation where the monitor always gets cleared and the atomic operation
29844 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
29845 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
29846 return AtomicExpansionKind::None;
29847
29848 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
29849 // it.
29850 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
29851 if (Size > 64)
29852 return AtomicExpansionKind::None;
29853
29854 return AtomicExpansionKind::LLSC;
29855}
29856
29857Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
29858 Type *ValueTy, Value *Addr,
29859 AtomicOrdering Ord) const {
29860 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29861 bool IsAcquire = isAcquireOrStronger(Ord);
29862
29863 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
29864 // intrinsic must return {i64, i64} and we have to recombine them into a
29865 // single i128 here.
29866 if (ValueTy->getPrimitiveSizeInBits() == 128) {
29867 Intrinsic::ID Int =
29868 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
29869
29870 Value *LoHi =
29871 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
29872
29873 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
29874 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
29875
29876 auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
29877 Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
29878 Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
29879
29880 Value *Or = Builder.CreateOr(
29881 Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
29882 return Builder.CreateBitCast(Or, ValueTy);
29883 }
29884
29885 Type *Tys[] = { Addr->getType() };
29886 Intrinsic::ID Int =
29887 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
29888
29889 const DataLayout &DL = M->getDataLayout();
29890 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
29891 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
29892 CI->addParamAttr(0, Attribute::get(Builder.getContext(),
29893 Attribute::ElementType, IntEltTy));
29894 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
29895
29896 return Builder.CreateBitCast(Trunc, ValueTy);
29897}
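// A minimal sketch of the IR produced above for a 128-bit acquire load-linked
// (value names are illustrative):
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo   = extractvalue { i64, i64 } %lohi, 0
//   %hi   = extractvalue { i64, i64 } %lohi, 1
//   ; then zext/shl/or rebuild the i128 value.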
29898
29899void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
29900 IRBuilderBase &Builder) const {
29901 Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {});
29902}
29903
29904Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
29905 Value *Val, Value *Addr,
29906 AtomicOrdering Ord) const {
29907 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29908 bool IsRelease = isReleaseOrStronger(Ord);
29909
29910 // Since the intrinsics must have legal type, the i128 intrinsics take two
29911 // parameters: "i64, i64". We must marshal Val into the appropriate form
29912 // before the call.
29913 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
29914 Intrinsic::ID Int =
29915 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
29916 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int);
29917 Type *Int64Ty = Type::getInt64Ty(M->getContext());
29918 Type *Int128Ty = Type::getInt128Ty(M->getContext());
29919
29920 Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
29921
29922 Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
29923 Value *Hi =
29924 Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
29925 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
29926 }
29927
29928 Intrinsic::ID Int =
29929 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
29930 Type *Tys[] = { Addr->getType() };
29931 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
29932
29933 const DataLayout &DL = M->getDataLayout();
29934 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
29935 Val = Builder.CreateBitCast(Val, IntValTy);
29936
29937 CallInst *CI = Builder.CreateCall(
29938 Stxr, {Builder.CreateZExtOrBitCast(
29939 Val, Stxr->getFunctionType()->getParamType(0)),
29940 Addr});
29941 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
29942 Attribute::ElementType, Val->getType()));
29943 return CI;
29944}
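// The matching store-conditional for the 128-bit case is (names illustrative):
//   %status = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
// where a non-zero %status means the exclusive store failed and the LL/SC loop
// built by AtomicExpand retries.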
29945
29946bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
29947 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
29948 const DataLayout &DL) const {
29949 if (!Ty->isArrayTy()) {
29950 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
29951 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
29952 }
29953
29954 // All non-aggregate members of the type must have the same type
29955 SmallVector<EVT> ValueVTs;
29956 ComputeValueVTs(*this, DL, Ty, ValueVTs);
29957 return all_equal(ValueVTs);
29958}
29959
29960bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
29961 EVT) const {
29962 return false;
29963}
29964
29965static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
29966 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
29967 Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
29968 M, Intrinsic::thread_pointer, IRB.getPtrTy());
29969 return IRB.CreatePointerCast(
29970 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
29971 Offset),
29972 IRB.getPtrTy(0));
29973}
29974
29975Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
29976 // Android provides a fixed TLS slot for the stack cookie. See the definition
29977 // of TLS_SLOT_STACK_GUARD in
29978 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
29979 if (Subtarget->isTargetAndroid())
29980 return UseTlsOffset(IRB, 0x28);
29981
29982 // Fuchsia is similar.
29983 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
29984 if (Subtarget->isTargetFuchsia())
29985 return UseTlsOffset(IRB, -0x10);
29986
29987 return TargetLowering::getIRStackGuard(IRB);
29988}
29989
29990void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
29991 // MSVC CRT provides functionalities for stack protection.
29992 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
29993 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
29994
29995 RTLIB::LibcallImpl SecurityCookieVar =
29996 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
29997 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
29998 SecurityCookieVar != RTLIB::Unsupported) {
29999 // MSVC CRT has a global variable holding security cookie.
30000 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
30001 PointerType::getUnqual(M.getContext()));
30002
30003 // MSVC CRT has a function to validate security cookie.
30004 FunctionCallee SecurityCheckCookie =
30005 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
30006 Type::getVoidTy(M.getContext()),
30007 PointerType::getUnqual(M.getContext()));
30008 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
30009 F->setCallingConv(CallingConv::Win64);
30010 F->addParamAttr(0, Attribute::AttrKind::InReg);
30011 }
30012 return;
30013 }
30014 TargetLowering::insertSSPDeclarations(M);
30015}
30016
30017Value *
30018AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
30019 // Android provides a fixed TLS slot for the SafeStack pointer. See the
30020 // definition of TLS_SLOT_SAFESTACK in
30021 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
30022 if (Subtarget->isTargetAndroid())
30023 return UseTlsOffset(IRB, 0x48);
30024
30025 return TargetLowering::getSafeStackPointerLocation(IRB);
30026}
30027
30028/// If a physical register, this returns the register that receives the
30029/// exception address on entry to an EH pad.
30030Register AArch64TargetLowering::getExceptionPointerRegister(
30031 const Constant *PersonalityFn) const {
30032 // FIXME: This is a guess. Has this been defined yet?
30033 return AArch64::X0;
30034}
30035
30036/// If a physical register, this returns the register that receives the
30037/// exception typeid on entry to a landing pad.
30038Register AArch64TargetLowering::getExceptionSelectorRegister(
30039 const Constant *PersonalityFn) const {
30040 // FIXME: This is a guess. Has this been defined yet?
30041 return AArch64::X1;
30042}
30043
30044bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
30045 const Instruction &AndI) const {
30046 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
30047 // this is likely to fold the and/cmp/br into a single tbz instruction. It
30048 // may be beneficial to sink in other cases, but we would have to check that
30049 // the cmp would not get folded into the br to form a cbz for these to be
30050 // beneficial.
30051 auto *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
30052 if (!Mask)
30053 return false;
30054 return Mask->getValue().isPowerOf2();
30055}
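// For example, "(X & 0x40) == 0" keeps the 'and' next to its compare so the pair
// can be selected as a single TBZ/TBNZ testing bit 6.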
30056
30057bool AArch64TargetLowering::
30058 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
30059 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
30060 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
30061 SelectionDAG &DAG) const {
30062 // Does baseline recommend not to perform the fold by default?
30063 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
30064 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
30065 return false;
30066 // Else, if this is a vector shift, prefer 'shl'.
30067 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
30068}
30069
30070TargetLowering::ShiftLegalizationStrategy
30071AArch64TargetLowering::preferredShiftLegalizationStrategy(
30072 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
30073 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
30074 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
30075 return ShiftLegalizationStrategy::LowerToLibcall;
30076 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
30077 ExpansionFactor);
30078}
30079
30080void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
30081 // Update IsSplitCSR in AArch64FunctionInfo.
30082 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
30083 AFI->setIsSplitCSR(true);
30084}
30085
30086void AArch64TargetLowering::insertCopiesSplitCSR(
30087 MachineBasicBlock *Entry,
30088 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
30089 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
30090 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
30091 if (!IStart)
30092 return;
30093
30094 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
30095 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
30096 MachineBasicBlock::iterator MBBI = Entry->begin();
30097 for (const MCPhysReg *I = IStart; *I; ++I) {
30098 const TargetRegisterClass *RC = nullptr;
30099 if (AArch64::GPR64RegClass.contains(*I))
30100 RC = &AArch64::GPR64RegClass;
30101 else if (AArch64::FPR64RegClass.contains(*I))
30102 RC = &AArch64::FPR64RegClass;
30103 else
30104 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
30105
30106 Register NewVR = MRI->createVirtualRegister(RC);
30107 // Create copy from CSR to a virtual register.
30108 // FIXME: this currently does not emit CFI pseudo-instructions, it works
30109 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
30110 // nounwind. If we want to generalize this later, we may need to emit
30111 // CFI pseudo-instructions.
30112 assert(Entry->getParent()->getFunction().hasFnAttribute(
30113 Attribute::NoUnwind) &&
30114 "Function should be nounwind in insertCopiesSplitCSR!");
30115 Entry->addLiveIn(*I);
30116 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
30117 .addReg(*I);
30118
30119 // Insert the copy-back instructions right before the terminator.
30120 for (auto *Exit : Exits)
30121 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
30122 TII->get(TargetOpcode::COPY), *I)
30123 .addReg(NewVR);
30124 }
30125}
30126
30127bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
30128 // Integer division on AArch64 is expensive. However, when aggressively
30129 // optimizing for code size, we prefer to use a div instruction, as it is
30130 // usually smaller than the alternative sequence.
30131 // The exception to this is vector division. Since AArch64 doesn't have vector
30132 // integer division, leaving the division as-is is a loss even in terms of
30133 // size, because it will have to be scalarized, while the alternative code
30134 // sequence can be performed in vector form.
30135 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
30136 return OptSize && !VT.isVector();
30137}
30138
30139bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
30140 const MachineFunction &MF) const {
30141 // Avoid merging stores into fixed-length vectors when Neon is unavailable.
30142 // In future, we could allow this when SVE is available, but currently,
30143 // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
30144 // the general lowering may introduce stack spills/reloads).
30145 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
30146 return false;
30147
30148 // Do not merge to float value size (128 bits) if no implicit float attribute
30149 // is set.
30150 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
30151 return !NoFloat || MemVT.getSizeInBits() <= 64;
30152}
30153
30154bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
30155 // We want inc-of-add for scalars and sub-of-not for vectors.
30156 return VT.isScalarInteger();
30157}
30158
30159bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
30160 EVT VT) const {
30161 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
30162 // legalize.
30163 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
30164 return false;
30165 if (FPVT == MVT::v8bf16)
30166 return false;
30167 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
30168}
30169
30170bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
30171 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
30172 // avoid vselect becoming bsl / unrolling.
30173 return !VT.isFixedLengthVector();
30174}
30175
30176MachineInstr *
30177AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
30178 MachineBasicBlock::iterator &MBBI,
30179 const TargetInstrInfo *TII) const {
30180 assert(MBBI->isCall() && MBBI->getCFIType() &&
30181 "Invalid call instruction for a KCFI check");
30182
30183 switch (MBBI->getOpcode()) {
30184 case AArch64::BLR:
30185 case AArch64::BLRNoIP:
30186 case AArch64::TCRETURNri:
30187 case AArch64::TCRETURNrix16x17:
30188 case AArch64::TCRETURNrix17:
30189 case AArch64::TCRETURNrinotx16:
30190 break;
30191 default:
30192 llvm_unreachable("Unexpected CFI call opcode");
30193 }
30194
30195 MachineOperand &Target = MBBI->getOperand(0);
30196 assert(Target.isReg() && "Invalid target operand for an indirect call");
30197 Target.setIsRenamable(false);
30198
30199 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
30200 .addReg(Target.getReg())
30201 .addImm(MBBI->getCFIType())
30202 .getInstr();
30203}
30204
30205bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
30206 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
30207}
30208
30209unsigned
30210AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
30211 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
30212 return getPointerTy(DL).getSizeInBits();
30213
30214 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
30215}
30216
30217void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
30218 MachineFrameInfo &MFI = MF.getFrameInfo();
30219 // If we have any vulnerable SVE stack objects then the stack protector
30220 // needs to be placed at the top of the SVE stack area, as the SVE locals
30221 // are placed above the other locals, so we allocate it as if it were a
30222 // scalable vector.
30223 // FIXME: It may be worthwhile having a specific interface for this rather
30224 // than doing it here in finalizeLowering.
30225 if (MFI.hasStackProtectorIndex()) {
30226 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
30227 if (MFI.hasScalableStackID(i) &&
30228 MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
30229 MFI.setStackID(MFI.getStackProtectorIndex(),
30230 TargetStackID::ScalableVector);
30231 MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
30232 break;
30233 }
30234 }
30235 }
30236
30237 TargetLoweringBase::finalizeLowering(MF);
30238}
30239
30240// Unlike X86, we let frame lowering assign offsets to all catch objects.
30241bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; }
30242
30243bool AArch64TargetLowering::shouldLocalize(
30244 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
30245 auto &MF = *MI.getMF();
30246 auto &MRI = MF.getRegInfo();
30247 auto maxUses = [](unsigned RematCost) {
30248 // A cost of 1 means remats are basically free.
30249 if (RematCost == 1)
30250 return std::numeric_limits<unsigned>::max();
30251 if (RematCost == 2)
30252 return 2U;
30253
30254 // Remat is too expensive, only sink if there's one user.
30255 if (RematCost > 2)
30256 return 1U;
30257 llvm_unreachable("Unexpected remat cost");
30258 };
30259
30260 unsigned Opc = MI.getOpcode();
30261 switch (Opc) {
30262 case TargetOpcode::G_GLOBAL_VALUE: {
30263 // On Darwin, TLS global vars get selected into function calls, which
30264 // we don't want localized, as they can get moved into the middle of
30265 // another call sequence.
30266 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
30267 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
30268 return false;
30269 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
30270 }
30271 case TargetOpcode::G_FCONSTANT:
30272 case TargetOpcode::G_CONSTANT: {
30273 const ConstantInt *CI;
30274 unsigned AdditionalCost = 0;
30275
30276 if (Opc == TargetOpcode::G_CONSTANT)
30277 CI = MI.getOperand(1).getCImm();
30278 else {
30279 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
30280 // We try to estimate cost of 32/64b fpimms, as they'll likely be
30281 // materialized as integers.
30282 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
30283 break;
30284 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
30285 bool OptForSize = MF.getFunction().hasOptSize();
30286 if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()),
30287 OptForSize))
30288 return true; // Constant should be cheap.
30289 CI =
30290 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
30291 // FP materialization also costs an extra move, from gpr to fpr.
30292 AdditionalCost = 1;
30293 }
30294 APInt Imm = CI->getValue();
30295 InstructionCost Cost = TTI->getIntImmCost(
30296 Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
30297 assert(Cost.isValid() && "Expected a valid imm cost");
30298
30299 unsigned RematCost = Cost.getValue();
30300 RematCost += AdditionalCost;
30301 Register Reg = MI.getOperand(0).getReg();
30302 unsigned MaxUses = maxUses(RematCost);
30303 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
30304 if (MaxUses == std::numeric_limits<unsigned>::max())
30305 --MaxUses;
30306 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
30307 }
30308 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
30309 // localizable.
30310 case AArch64::ADRP:
30311 case AArch64::G_ADD_LOW:
30312 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
30313 case TargetOpcode::G_PTR_ADD:
30314 return true;
30315 default:
30316 break;
30317 }
30318 return TargetLoweringBase::shouldLocalize(MI, TTI);
30319}
30320
30321bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
30322 // Fallback for scalable vectors.
30323 // Note that if EnableSVEGISel is true, we allow scalable vector types for
30324 // all instructions, regardless of whether they are actually supported.
30325 if (!EnableSVEGISel) {
30326 if (Inst.getType()->isScalableTy()) {
30327 return true;
30328 }
30329
30330 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
30331 if (Inst.getOperand(i)->getType()->isScalableTy())
30332 return true;
30333
30334 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
30335 if (AI->getAllocatedType()->isScalableTy())
30336 return true;
30337 }
30338 }
30339
30340 // Checks to allow the use of SME instructions
30341 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
30342 auto CallAttrs = SMECallAttrs(*Base, &getRuntimeLibcallsInfo());
30343 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
30344 CallAttrs.requiresPreservingZT0() ||
30345 CallAttrs.requiresPreservingAllZAState())
30346 return true;
30347 }
30348 return false;
30349}
30350
30351// Return the largest legal scalable vector type that matches VT's element type.
30352static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
30353 assert(VT.isFixedLengthVector() &&
30354 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
30355 "Expected legal fixed length vector!");
30356 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
30357 default:
30358 llvm_unreachable("unexpected element type for SVE container");
30359 case MVT::i8:
30360 return EVT(MVT::nxv16i8);
30361 case MVT::i16:
30362 return EVT(MVT::nxv8i16);
30363 case MVT::i32:
30364 return EVT(MVT::nxv4i32);
30365 case MVT::i64:
30366 return EVT(MVT::nxv2i64);
30367 case MVT::bf16:
30368 return EVT(MVT::nxv8bf16);
30369 case MVT::f16:
30370 return EVT(MVT::nxv8f16);
30371 case MVT::f32:
30372 return EVT(MVT::nxv4f32);
30373 case MVT::f64:
30374 return EVT(MVT::nxv2f64);
30375 }
30376}
30377
30378// Return a predicate with active lanes corresponding to the extent of VT.
30379static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
30380 EVT VT) {
30381 assert(VT.isFixedLengthVector() &&
30382 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
30383 "Expected legal fixed length vector!");
30384
30385 std::optional<unsigned> PgPattern =
30386 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
30387 assert(PgPattern && "Unexpected element count for SVE predicate");
30388
30389 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
30390 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
30391 // variants of instructions when available.
30392 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
30393 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
30394 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
30395 if (MaxSVESize && MinSVESize == MaxSVESize &&
30396 MaxSVESize == VT.getSizeInBits())
30397 PgPattern = AArch64SVEPredPattern::all;
30398
30399 MVT MaskVT;
30400 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
30401 default:
30402 llvm_unreachable("unexpected element type for SVE predicate");
30403 case MVT::i8:
30404 MaskVT = MVT::nxv16i1;
30405 break;
30406 case MVT::i16:
30407 case MVT::f16:
30408 case MVT::bf16:
30409 MaskVT = MVT::nxv8i1;
30410 break;
30411 case MVT::i32:
30412 case MVT::f32:
30413 MaskVT = MVT::nxv4i1;
30414 break;
30415 case MVT::i64:
30416 case MVT::f64:
30417 MaskVT = MVT::nxv2i1;
30418 break;
30419 }
30420
30421 return getPTrue(DAG, DL, MaskVT, *PgPattern);
30422}
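// Example: lowering a v8i32 operation yields an nxv4i1 PTRUE with the VL8 pattern;
// when the SVE vector length is known to be exactly 256 bits the pattern is relaxed
// to 'all' so unpredicated instruction forms can be used.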
30423
30424static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
30425 EVT VT) {
30426 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
30427 "Expected legal scalable vector!");
30428 auto PredTy = VT.changeVectorElementType(MVT::i1);
30429 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
30430}
30431static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
30433 if (VT.isFixedLengthVector())
30434 return getPredicateForFixedLengthVector(DAG, DL, VT);
30435
30436 return getPredicateForScalableVector(DAG, DL, VT);
30437}
30438
30439// Grow V to consume an entire SVE register.
30440static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
30441 assert(VT.isScalableVector() &&
30442 "Expected to convert into a scalable vector!");
30443 assert(V.getValueType().isFixedLengthVector() &&
30444 "Expected a fixed length vector operand!");
30445 SDLoc DL(V);
30446 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30447 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
30448}
30449
30450// Shrink V so it's just big enough to maintain a VT's worth of data.
30451static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
30452 assert(VT.isFixedLengthVector() &&
30453 "Expected to convert into a fixed length vector!");
30454 assert(V.getValueType().isScalableVector() &&
30455 "Expected a scalable vector operand!");
30456 SDLoc DL(V);
30457 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30458 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
30459}
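// Together these helpers implement the usual fixed-length-on-SVE idiom: widen with
// INSERT_SUBVECTOR into an undef scalable vector at index 0, operate on the scalable
// type, then narrow back with EXTRACT_SUBVECTOR at index 0.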
30460
30461// Convert all fixed length vector loads larger than NEON to masked_loads.
30462SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
30463 SDValue Op, SelectionDAG &DAG) const {
30464 auto Load = cast<LoadSDNode>(Op);
30465
30466 SDLoc DL(Op);
30467 EVT VT = Op.getValueType();
30468 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30469 EVT LoadVT = ContainerVT;
30470 EVT MemVT = Load->getMemoryVT();
30471
30472 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
30473
30474 if (VT.isFloatingPoint()) {
30475 LoadVT = ContainerVT.changeTypeToInteger();
30476 MemVT = MemVT.changeTypeToInteger();
30477 }
30478
30479 SDValue NewLoad = DAG.getMaskedLoad(
30480 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
30481 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
30482 Load->getAddressingMode(), Load->getExtensionType());
30483
30484 SDValue Result = NewLoad;
30485 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
30486 EVT ExtendVT = ContainerVT.changeVectorElementType(
30487 Load->getMemoryVT().getVectorElementType());
30488
30489 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
30490 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
30491 Pg, Result, DAG.getUNDEF(ContainerVT));
30492 } else if (VT.isFloatingPoint()) {
30493 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
30494 }
30495
30496 Result = convertFromScalableVector(DAG, VT, Result);
30497 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
30498 return DAG.getMergeValues(MergedValues, DL);
30499}
30500
30501static SDValue convertFixedMaskToScalableVector(SDValue Mask,
30502 SelectionDAG &DAG) {
30503 SDLoc DL(Mask);
30504 EVT InVT = Mask.getValueType();
30505 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30506 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
30507
30508 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
30509 return Pg;
30510
30511 bool InvertCond = false;
30512 if (isBitwiseNot(Mask)) {
30513 InvertCond = true;
30514 Mask = Mask.getOperand(0);
30515 }
30516
30517 SDValue Op1, Op2;
30518 ISD::CondCode CC;
30519
30520 // When Mask is the result of a SETCC, it's better to regenerate the compare.
30521 if (Mask.getOpcode() == ISD::SETCC) {
30522 Op1 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(0));
30523 Op2 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(1));
30524 CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
30525 } else {
30526 Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
30527 Op2 = DAG.getConstant(0, DL, ContainerVT);
30528 CC = ISD::SETNE;
30529 }
30530
30531 if (InvertCond)
30532 CC = getSetCCInverse(CC, Op1.getValueType());
30533
30534 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
30535 {Pg, Op1, Op2, DAG.getCondCode(CC)});
30536}
30537
30538// Convert all fixed length vector loads larger than NEON to masked_loads.
30539SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
30540 SDValue Op, SelectionDAG &DAG) const {
30541 auto Load = cast<MaskedLoadSDNode>(Op);
30542
30543 SDLoc DL(Op);
30544 EVT VT = Op.getValueType();
30545 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30546
30547 SDValue Mask = Load->getMask();
30548 // If this is an extending load and the mask type is not the same as
30549 // load's type then we have to extend the mask type.
30550 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
30551 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
30552 "Incorrect mask type");
30553 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
30554 }
30555 Mask = convertFixedMaskToScalableVector(Mask, DAG);
30556
30557 SDValue PassThru;
30558 bool IsPassThruZeroOrUndef = false;
30559
30560 if (Load->getPassThru()->isUndef()) {
30561 PassThru = DAG.getUNDEF(ContainerVT);
30562 IsPassThruZeroOrUndef = true;
30563 } else {
30564 if (ContainerVT.isInteger())
30565 PassThru = DAG.getConstant(0, DL, ContainerVT);
30566 else
30567 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
30568 if (isZerosVector(Load->getPassThru().getNode()))
30569 IsPassThruZeroOrUndef = true;
30570 }
30571
30572 SDValue NewLoad = DAG.getMaskedLoad(
30573 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
30574 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
30575 Load->getAddressingMode(), Load->getExtensionType());
30576
30577 SDValue Result = NewLoad;
30578 if (!IsPassThruZeroOrUndef) {
30579 SDValue OldPassThru =
30580 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
30581 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
30582 }
30583
30584 Result = convertFromScalableVector(DAG, VT, Result);
30585 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
30586 return DAG.getMergeValues(MergedValues, DL);
30587}
30588
30589// Convert all fixed length vector stores larger than NEON to masked_stores.
30590SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
30591 SDValue Op, SelectionDAG &DAG) const {
30592 auto Store = cast<StoreSDNode>(Op);
30593
30594 SDLoc DL(Op);
30595 EVT VT = Store->getValue().getValueType();
30596 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30597 EVT MemVT = Store->getMemoryVT();
30598
30599 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
30600 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
30601
30602 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
30603 EVT TruncVT = ContainerVT.changeVectorElementType(
30604 Store->getMemoryVT().getVectorElementType());
30605 MemVT = MemVT.changeTypeToInteger();
30606 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
30607 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
30608 DAG.getUNDEF(TruncVT));
30609 NewValue =
30610 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
30611 } else if (VT.isFloatingPoint()) {
30612 MemVT = MemVT.changeTypeToInteger();
30613 NewValue =
30614 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
30615 }
30616
30617 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
30618 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
30619 Store->getMemOperand(), Store->getAddressingMode(),
30620 Store->isTruncatingStore());
30621}
30622
30623SDValue AArch64TargetLowering::LowerMSTORE(SDValue Op,
30624 SelectionDAG &DAG) const {
30625 SDLoc DL(Op);
30626 auto *Store = cast<MaskedStoreSDNode>(Op);
30627 EVT VT = Store->getValue().getValueType();
30628 if (VT.isFixedLengthVector())
30629 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
30630
30631 if (!Store->isCompressingStore())
30632 return SDValue();
30633
30634 EVT MaskVT = Store->getMask().getValueType();
30635 EVT MaskExtVT = getPromotedVTForPredicate(MaskVT);
30636 EVT MaskReduceVT = MaskExtVT.getScalarType();
30637 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30638
30639 SDValue MaskExt =
30640 DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Store->getMask());
30641 SDValue CntActive =
30642 DAG.getNode(ISD::VECREDUCE_ADD, DL, MaskReduceVT, MaskExt);
30643 if (MaskReduceVT != MVT::i64)
30644 CntActive = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CntActive);
30645
30646 SDValue CompressedValue =
30647 DAG.getNode(ISD::VECTOR_COMPRESS, DL, VT, Store->getValue(),
30648 Store->getMask(), DAG.getPOISON(VT));
30649 SDValue CompressedMask =
30650 DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);
30651
30652 return DAG.getMaskedStore(Store->getChain(), DL, CompressedValue,
30653 Store->getBasePtr(), Store->getOffset(),
30654 CompressedMask, Store->getMemoryVT(),
30655 Store->getMemOperand(), Store->getAddressingMode(),
30656 Store->isTruncatingStore(),
30657 /*isCompressing=*/false);
30658}
30659
30660SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
30661 SDValue Op, SelectionDAG &DAG) const {
30662 auto Store = cast<MaskedStoreSDNode>(Op);
30663
30664 SDLoc DL(Op);
30665 EVT VT = Store->getValue().getValueType();
30666 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30667
30668 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
30669 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
30670
30671 return DAG.getMaskedStore(
30672 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
30673 Mask, Store->getMemoryVT(), Store->getMemOperand(),
30674 Store->getAddressingMode(), Store->isTruncatingStore(),
30675 Store->isCompressingStore());
30676}
30677
30678SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
30679 SDValue Op, SelectionDAG &DAG) const {
30680 SDLoc DL(Op);
30681 EVT VT = Op.getValueType();
30682 EVT EltVT = VT.getVectorElementType();
30683
30684 bool Signed = Op.getOpcode() == ISD::SDIV;
30685 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
30686
30687 bool Negated;
30688 uint64_t SplatVal;
30689 // NOTE: SRAD cannot be used to represent sdiv-by-one.
30690 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
30691 SplatVal > 1) {
30692 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30693 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
30694 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
30695
30696 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
30697 SDValue Res =
30698 DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
30699 if (Negated)
30700 Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
30701 DAG.getConstant(0, DL, ContainerVT), Res);
30702
30703 return convertFromScalableVector(DAG, VT, Res);
30704 }
30705
30706 // Scalable vector i32/i64 DIV is supported.
30707 if (EltVT == MVT::i32 || EltVT == MVT::i64)
30708 return LowerToPredicatedOp(Op, DAG, PredOpcode);
30709
30710 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
30711 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
30712 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
30713 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30714
30715 // If the wider type is legal: extend, op, and truncate.
30716 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
30717 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
30718 SDValue Op0 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(0));
30719 SDValue Op1 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(1));
30720 SDValue Div = DAG.getNode(Op.getOpcode(), DL, WideVT, Op0, Op1);
30721 return DAG.getNode(ISD::TRUNCATE, DL, VT, Div);
30722 }
30723
30724 auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
30725 &ExtendOpcode](SDValue Op) {
30726 SDValue IdxZero = DAG.getConstant(0, DL, MVT::i64);
30727 SDValue IdxHalf =
30728 DAG.getConstant(HalfVT.getVectorNumElements(), DL, MVT::i64);
30729 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxZero);
30730 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxHalf);
30731 return std::pair<SDValue, SDValue>(
30732 {DAG.getNode(ExtendOpcode, DL, PromVT, Lo),
30733 DAG.getNode(ExtendOpcode, DL, PromVT, Hi)});
30734 };
30735
30736 // If wider type is not legal: split, extend, op, trunc and concat.
30737 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
30738 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
30739 SDValue Lo = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0LoExt, Op1LoExt);
30740 SDValue Hi = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0HiExt, Op1HiExt);
30741 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Lo);
30742 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Hi);
30743 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoTrunc, HiTrunc});
30744}
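// e.g. a fixed-length i8/i16 divide is either performed once in the twice-as-wide
// element type when that type is legal, or split into two halves that are extended,
// divided, truncated and concatenated back together.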
30745
30746SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
30747 SDValue Op, SelectionDAG &DAG) const {
30748 EVT VT = Op.getValueType();
30749 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30750
30751 SDLoc DL(Op);
30752 SDValue Val = Op.getOperand(0);
30753 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
30754 Val = convertToScalableVector(DAG, ContainerVT, Val);
30755
30756 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
30757 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
30758
30759 // Repeatedly unpack Val until the result is of the desired element type.
30760 switch (ContainerVT.getSimpleVT().SimpleTy) {
30761 default:
30762 llvm_unreachable("unimplemented container type");
30763 case MVT::nxv16i8:
30764 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
30765 if (VT.getVectorElementType() == MVT::i16)
30766 break;
30767 [[fallthrough]];
30768 case MVT::nxv8i16:
30769 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
30770 if (VT.getVectorElementType() == MVT::i32)
30771 break;
30772 [[fallthrough]];
30773 case MVT::nxv4i32:
30774 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
30775 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
30776 break;
30777 }
30778
30779 return convertFromScalableVector(DAG, VT, Val);
30780}
30781
30782SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
30783 SDValue Op, SelectionDAG &DAG) const {
30784 EVT VT = Op.getValueType();
30785 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30786
30787 SDLoc DL(Op);
30788 SDValue Val = Op.getOperand(0);
30789 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
30790 Val = convertToScalableVector(DAG, ContainerVT, Val);
30791
30792 // Repeatedly truncate Val until the result is of the desired element type.
30793 switch (ContainerVT.getSimpleVT().SimpleTy) {
30794 default:
30795 llvm_unreachable("unimplemented container type");
30796 case MVT::nxv2i64:
30797 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
30798 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
30799 if (VT.getVectorElementType() == MVT::i32)
30800 break;
30801 [[fallthrough]];
30802 case MVT::nxv4i32:
30803 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
30804 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
30805 if (VT.getVectorElementType() == MVT::i16)
30806 break;
30807 [[fallthrough]];
30808 case MVT::nxv8i16:
30809 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
30810 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
30811 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
30812 break;
30813 }
30814
30815 return convertFromScalableVector(DAG, VT, Val);
30816}
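// UZP1 with the same operand in both positions keeps only the even-indexed (low)
// half-width elements, so each step above halves the element size until the
// requested truncated type is reached.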
30817
30818SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
30819 SDValue Op, SelectionDAG &DAG) const {
30820 EVT VT = Op.getValueType();
30821 EVT InVT = Op.getOperand(0).getValueType();
30822 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
30823
30824 SDLoc DL(Op);
30825 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30826 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
30827
30828 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
30829}
30830
30831SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
30832 SDValue Op, SelectionDAG &DAG) const {
30833 EVT VT = Op.getValueType();
30834 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
30835
30836 SDLoc DL(Op);
30837 EVT InVT = Op.getOperand(0).getValueType();
30838 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
30839 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
30840
30841 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
30842 Op.getOperand(1), Op.getOperand(2));
30843
30844 return convertFromScalableVector(DAG, VT, ScalableRes);
30845}
30846
30847// Convert vector operation 'Op' to an equivalent predicated operation whereby
30848// the original operation's type is used to construct a suitable predicate.
30849// NOTE: The results for inactive lanes are undefined.
30850SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
30851 SelectionDAG &DAG,
30852 unsigned NewOp) const {
30853 EVT VT = Op.getValueType();
30854 SDLoc DL(Op);
30855 auto Pg = getPredicateForVector(DAG, DL, VT);
30856
30857 if (VT.isFixedLengthVector()) {
30858 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
30859 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30860
30861 // Create list of operands by converting existing ones to scalable types.
30862 SmallVector<SDValue, 4> Operands = {Pg};
30863 for (const SDValue &V : Op->op_values()) {
30864 if (isa<CondCodeSDNode>(V)) {
30865 Operands.push_back(V);
30866 continue;
30867 }
30868
30869 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
30870 EVT VTArg = VTNode->getVT().getVectorElementType();
30871 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
30872 Operands.push_back(DAG.getValueType(NewVTArg));
30873 continue;
30874 }
30875
30876 assert(isTypeLegal(V.getValueType()) &&
30877 "Expected only legal fixed-width types");
30878 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
30879 }
30880
30881 if (isMergePassthruOpcode(NewOp))
30882 Operands.push_back(DAG.getUNDEF(ContainerVT));
30883
30884 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
30885 return convertFromScalableVector(DAG, VT, ScalableRes);
30886 }
30887
30888 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
30889
30890 SmallVector<SDValue, 4> Operands = {Pg};
30891 for (const SDValue &V : Op->op_values()) {
30892 assert((!V.getValueType().isVector() ||
30893 V.getValueType().isScalableVector()) &&
30894 "Only scalable vectors are supported!");
30895 Operands.push_back(V);
30896 }
30897
30898 if (isMergePassthruOpcode(NewOp))
30899 Operands.push_back(DAG.getUNDEF(VT));
30900
30901 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
30902}
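// Illustrative example: a legal fixed-length (fadd v4f32 %a, %b) lowered with
// NewOp == AArch64ISD::FADD_PRED becomes roughly
//   Pg = predicate covering the 4 active lanes
//   %a', %b' = convertToScalableVector(nxv4f32, ...)
//   %r = FADD_PRED Pg, %a', %b'
//   return convertFromScalableVector(v4f32, %r)
// The inactive lanes of %r are undefined, which is acceptable because only the
// fixed-length portion is ever read back.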
30903
30904// If a fixed length vector operation has no side effects when applied to
30905// undefined elements, we can safely use scalable vectors to perform the same
30906// operation without needing to worry about predication.
30907SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
30908 SelectionDAG &DAG) const {
30909 EVT VT = Op.getValueType();
30911 "Only expected to lower fixed length vector operation!");
30912 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
30913
30914 // Create list of operands by converting existing ones to scalable types.
30915 SmallVector<SDValue, 4> Ops;
30916 for (const SDValue &V : Op->op_values()) {
30917 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
30918
30919 // Pass through non-vector operands.
30920 if (!V.getValueType().isVector()) {
30921 Ops.push_back(V);
30922 continue;
30923 }
30924
30925 // "cast" fixed length vector to a scalable vector.
30926 assert(V.getValueType().isFixedLengthVector() &&
30927 isTypeLegal(V.getValueType()) &&
30928 "Only fixed length vectors are supported!");
30929 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
30930 }
30931
30932 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
30933 return convertFromScalableVector(DAG, VT, ScalableRes);
30934}
30935
30936SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
30937 SelectionDAG &DAG) const {
30938 SDLoc DL(ScalarOp);
30939 SDValue AccOp = ScalarOp.getOperand(0);
30940 SDValue VecOp = ScalarOp.getOperand(1);
30941 EVT SrcVT = VecOp.getValueType();
30942 EVT ResVT = SrcVT.getVectorElementType();
30943
30944 EVT ContainerVT = SrcVT;
30945 if (SrcVT.isFixedLengthVector()) {
30946 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
30947 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
30948 }
30949
30950 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
30951 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
30952
30953 // Convert operands to Scalable.
30954 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
30955 DAG.getUNDEF(ContainerVT), AccOp, Zero);
30956
30957 // Perform reduction.
30958 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
30959 Pg, AccOp, VecOp);
30960
30961 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
30962}
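// Illustrative example: the strictly-ordered reduction
//   %r = call float @llvm.vector.reduce.fadd.v4f32(float %acc, <4 x float> %v)
// is lowered by inserting %acc into lane 0 of an undef nxv4f32 vector,
// performing FADDA_PRED over the active lanes (which folds them in order), and
// extracting lane 0 of the result as the scalar return value.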
30963
30964SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
30965 SelectionDAG &DAG) const {
30966 SDLoc DL(ReduceOp);
30967 SDValue Op = ReduceOp.getOperand(0);
30968 EVT OpVT = Op.getValueType();
30969 EVT VT = ReduceOp.getValueType();
30970
30971 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
30972 return SDValue();
30973
30974 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
30975
30976 switch (ReduceOp.getOpcode()) {
30977 default:
30978 return SDValue();
30979 case ISD::VECREDUCE_OR:
30980 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
30981 // The predicate can be 'Op' because
30982 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
30983 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
30984 else
30985 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
30986 case ISD::VECREDUCE_AND: {
30987 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
30988 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
30989 }
30990 case ISD::VECREDUCE_XOR: {
30991 SDValue ID =
30992 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
30993 if (OpVT == MVT::nxv1i1) {
30994 // Emulate a CNTP on .Q using .D and a different governing predicate.
30995 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
30996 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
30997 }
30998 SDValue Cntp =
30999 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
31000 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
31001 }
31002 }
31003
31004 return SDValue();
31005}
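// Illustrative example: for an nxv16i1 predicate P with governing predicate Pg,
//   vecreduce_or(P)  -> PTEST with the ANY_ACTIVE condition (using P itself as
//                       the governing predicate when Pg is known all-active),
//   vecreduce_and(P) -> PTEST(Pg, P ^ Pg) with the NONE_ACTIVE condition,
//   vecreduce_xor(P) -> CNTP(Pg, P) truncated to the result type, i.e. the
//                       parity of the number of active lanes.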
31006
31007SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
31008 SDValue ScalarOp,
31009 SelectionDAG &DAG) const {
31010 SDLoc DL(ScalarOp);
31011 SDValue VecOp = ScalarOp.getOperand(0);
31012 EVT SrcVT = VecOp.getValueType();
31013
31014 if (useSVEForFixedLengthVectorVT(
31015 SrcVT,
31016 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
31017 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
31018 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
31019 }
31020
31021 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
31022 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
31023 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
31024 SDValue BoolVec = VecOp.getOperand(0);
31025 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
31026 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
31027 SDValue CntpOp = DAG.getNode(
31028 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
31029 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
31030 BoolVec, BoolVec);
31031 return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
31032 }
31033 }
31034
31035 // UADDV always returns an i64 result.
31036 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
31037 SrcVT.getVectorElementType();
31038 EVT RdxVT = SrcVT;
31039 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
31040 RdxVT = getPackedSVEVectorVT(ResVT);
31041
31042 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
31043 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
31044 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
31045 Rdx, DAG.getConstant(0, DL, MVT::i64));
31046
31047 // The VEC_REDUCE nodes expect an element-sized result.
31048 if (ResVT != ScalarOp.getValueType())
31049 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
31050
31051 return Res;
31052}
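// Illustrative example: an add reduction over a fixed-length v16i8 uses
// Opcode == AArch64ISD::UADDV_PRED, so RdxVT becomes the packed nxv2i64 type,
// the predicated UADDV leaves its i64 result in lane 0, and the extracted
// value is then truncated back to the narrower type expected by the original
// VECREDUCE node.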
31053
31054SDValue
31055AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
31056 SelectionDAG &DAG) const {
31057 EVT VT = Op.getValueType();
31058 SDLoc DL(Op);
31059
31060 EVT InVT = Op.getOperand(1).getValueType();
31061 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
31062 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
31063 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
31064
31065 // Convert the mask to a predicate (NOTE: We don't need to worry about
31066 // inactive lanes since VSELECT is safe when given undefined elements).
31067 EVT MaskVT = Op.getOperand(0).getValueType();
31068 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
31069 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
31070 Mask = DAG.getNode(ISD::TRUNCATE, DL,
31071 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
31072
31073 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
31074 Mask, Op1, Op2);
31075
31076 return convertFromScalableVector(DAG, VT, ScalableRes);
31077}
31078
31079SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
31080 SDValue Op, SelectionDAG &DAG) const {
31081 SDLoc DL(Op);
31082 EVT InVT = Op.getOperand(0).getValueType();
31083 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
31084
31085 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
31086 "Only expected to lower fixed length vector operation!");
31087 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
31088 "Expected integer result of the same bit length as the inputs!");
31089
31090 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
31091 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
31092 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
31093
31094 EVT CmpVT = Pg.getValueType();
31095 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
31096 {Pg, Op1, Op2, Op.getOperand(2)});
31097
31098 EVT PromoteVT = ContainerVT.changeTypeToInteger();
31099 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
31100 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
31101}
31102
31103SDValue
31104AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
31105 SelectionDAG &DAG) const {
31106 SDLoc DL(Op);
31107 auto SrcOp = Op.getOperand(0);
31108 EVT VT = Op.getValueType();
31109 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
31110 EVT ContainerSrcVT =
31111 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
31112
31113 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
31114 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
31115 return convertFromScalableVector(DAG, VT, Op);
31116}
31117
31118SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
31119 SDValue Op, SelectionDAG &DAG) const {
31120 SDLoc DL(Op);
31121 unsigned NumOperands = Op->getNumOperands();
31122
31123 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
31124 "Unexpected number of operands in CONCAT_VECTORS");
31125
31126 auto SrcOp1 = Op.getOperand(0);
31127 auto SrcOp2 = Op.getOperand(1);
31128 EVT VT = Op.getValueType();
31129 EVT SrcVT = SrcOp1.getValueType();
31130
31131 // Match a splat of 128b segments that fit in a single register.
31132 if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
31133 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31134 SDValue Splat =
31135 DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
31136 convertToScalableVector(DAG, ContainerVT, SrcOp1),
31137 DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
31138 return convertFromScalableVector(DAG, VT, Splat);
31139 }
31140
31141 if (NumOperands > 2) {
31142 SmallVector<SDValue, 4> Ops;
31143 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
31144 for (unsigned I = 0; I < NumOperands; I += 2)
31145 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
31146 Op->getOperand(I), Op->getOperand(I + 1)));
31147
31148 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
31149 }
31150
31151 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31152
31153 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
31154 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
31155 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
31156
31157 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
31158
31159 return convertFromScalableVector(DAG, VT, Op);
31160}
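// Illustrative example: concat_vectors(v2f64 %a, v2f64 %b) with a 256-bit
// container becomes SPLICE(Pg, %a', %b'), where Pg covers %a's two elements;
// SPLICE keeps the active prefix of its first operand and appends elements
// from the second, which is exactly the concatenation. Wider concats are first
// reduced pairwise to the two-operand form above.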
31161
31162SDValue
31163AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
31164 SelectionDAG &DAG) const {
31165 EVT VT = Op.getValueType();
31166 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31167
31168 SDLoc DL(Op);
31169 SDValue Val = Op.getOperand(0);
31170 SDValue Pg = getPredicateForVector(DAG, DL, VT);
31171 EVT SrcVT = Val.getValueType();
31172 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31173 EVT ExtendVT = ContainerVT.changeVectorElementType(
31174 SrcVT.getVectorElementType());
31175
31176 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
31177 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
31178
31179 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
31180 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
31181 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
31182 Pg, Val, DAG.getUNDEF(ContainerVT));
31183
31184 return convertFromScalableVector(DAG, VT, Val);
31185}
31186
31187SDValue
31188AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
31189 SelectionDAG &DAG) const {
31190 EVT VT = Op.getValueType();
31191 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31192
31193 SDLoc DL(Op);
31194 SDValue Val = Op.getOperand(0);
31195 EVT SrcVT = Val.getValueType();
31196 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
31197 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
31198 VT.getVectorElementType());
31199 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
31200
31201 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
31202 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
31203 Op.getOperand(1), DAG.getUNDEF(RoundVT));
31204 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
31205 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
31206
31207 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
31208 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
31209}
31210
31211SDValue
31212AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
31213 SelectionDAG &DAG) const {
31214 EVT VT = Op.getValueType();
31215 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31216
31217 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
31218 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
31219 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
31220
31221 SDLoc DL(Op);
31222 SDValue Val = Op.getOperand(0);
31223 EVT SrcVT = Val.getValueType();
31224 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
31225 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
31226
31227 if (VT.bitsGE(SrcVT)) {
31228 SDValue Pg = getPredicateForVector(DAG, DL, VT);
31229
31230 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
31231 VT.changeTypeToInteger(), Val);
31232
31233 // Safe to use a larger-than-specified operand because promoting the value
31234 // changes nothing from an arithmetic point of view.
31235 Val =
31236 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
31237 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
31238 DAG.getUNDEF(ContainerDstVT));
31239 return convertFromScalableVector(DAG, VT, Val);
31240 } else {
31241 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
31242 ContainerDstVT.getVectorElementType());
31243 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
31244
31245 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
31246 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
31247 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
31248 Val = convertFromScalableVector(DAG, SrcVT, Val);
31249
31250 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
31251 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
31252 }
31253}
31254
31255SDValue
31256AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
31257 SelectionDAG &DAG) const {
31258 SDLoc DL(Op);
31259 EVT OpVT = Op.getValueType();
31260 assert(OpVT.isScalableVector() &&
31261 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
31262
31263 if (Op->getNumOperands() == 3) {
31264 // aarch64_sve_ld3 only supports packed datatypes.
31265 EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
31266 Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
31267 SDValue StackPtr =
31268 DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
31269
31270 // Write out unmodified operands.
31271 SmallVector<SDValue, 3> Chains;
31272 for (unsigned I = 0; I < 3; ++I) {
31273 SDValue Ptr =
31274 DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
31275 SDValue V = getSVESafeBitCast(PackedVT, Op.getOperand(I), DAG);
31276 Chains.push_back(
31277 DAG.getStore(DAG.getEntryNode(), DL, V, Ptr, MachinePointerInfo()));
31278 }
31279
31280 Intrinsic::ID IntID = Intrinsic::aarch64_sve_ld3_sret;
31281 EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
31282
31283 SmallVector<SDValue, 4> Ops;
31284 Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains));
31285 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
31286 Ops.push_back(DAG.getConstant(1, DL, PredVT));
31287 Ops.push_back(StackPtr);
31288
31289 // Read back and deinterleave data.
31290 SDVTList VTs = DAG.getVTList(PackedVT, PackedVT, PackedVT, MVT::Other);
31291 SDValue LD3 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops);
31292
31293 SmallVector<SDValue, 3> Results;
31294 Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(0), DAG));
31295 Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(1), DAG));
31296 Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(2), DAG));
31297 return DAG.getMergeValues(Results, DL);
31298 }
31299
31300 // Are multi-register uzp instructions available?
31301 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
31302 OpVT.getVectorElementType() != MVT::i1) {
31303 Intrinsic::ID IntID;
31304 switch (Op->getNumOperands()) {
31305 default:
31306 return SDValue();
31307 case 2:
31308 IntID = Intrinsic::aarch64_sve_uzp_x2;
31309 break;
31310 case 4:
31311 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
31312 OpVT.getScalarSizeInBits() == 64)
31313 return SDValue();
31314 IntID = Intrinsic::aarch64_sve_uzp_x4;
31315 break;
31316 }
31317
31318 SmallVector<SDValue, 5> Ops;
31319 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
31320 Ops.append(Op->op_values().begin(), Op->op_values().end());
31321 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
31322 }
31323
31324 if (Op->getNumOperands() != 2)
31325 return SDValue();
31326
31327 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
31328 Op.getOperand(1));
31329 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
31330 Op.getOperand(1));
31331 return DAG.getMergeValues({Even, Odd}, DL);
31332}
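// Illustrative example: a two-way deinterleave of nxv4i32 operands A and B is
// just the unzip pair
//   even = UZP1(A, B)   // elements 0, 2, 4, ...
//   odd  = UZP2(A, B)   // elements 1, 3, 5, ...
// whereas the three-way form has no single instruction and is instead spilled
// to a stack slot and reloaded with ld3, which deinterleaves through memory.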
31333
31334SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
31335 SelectionDAG &DAG) const {
31336 SDLoc DL(Op);
31337 EVT OpVT = Op.getValueType();
31338 assert(OpVT.isScalableVector() &&
31339 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
31340
31341 if (Op->getNumOperands() == 3) {
31342 // aarch64_sve_st3 only supports packed datatypes.
31343 EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
31344 SmallVector<SDValue, 3> InVecs;
31345 for (SDValue V : Op->ops())
31346 InVecs.push_back(getSVESafeBitCast(PackedVT, V, DAG));
31347
31348 Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
31349 SDValue StackPtr =
31350 DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
31351
31352 Intrinsic::ID IntID = Intrinsic::aarch64_sve_st3;
31353 EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
31354
31355 SmallVector<SDValue, 7> Ops;
31356 Ops.push_back(DAG.getEntryNode());
31357 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
31358 Ops.append(InVecs);
31359 Ops.push_back(DAG.getConstant(1, DL, PredVT));
31360 Ops.push_back(StackPtr);
31361
31362 // Interleave operands and store.
31363 SDValue Chain = DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops);
31364
31365 // Read back the interleaved data.
31366 SmallVector<SDValue, 3> Results;
31367 for (unsigned I = 0; I < 3; ++I) {
31368 SDValue Ptr =
31369 DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
31370 SDValue L = DAG.getLoad(PackedVT, DL, Chain, Ptr, MachinePointerInfo());
31371 Results.push_back(getSVESafeBitCast(OpVT, L, DAG));
31372 }
31373
31374 return DAG.getMergeValues(Results, DL);
31375 }
31376
31377 // Are multi-register zip instructions available?
31378 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
31379 OpVT.getVectorElementType() != MVT::i1) {
31380 Intrinsic::ID IntID;
31381 switch (Op->getNumOperands()) {
31382 default:
31383 return SDValue();
31384 case 2:
31385 IntID = Intrinsic::aarch64_sve_zip_x2;
31386 break;
31387 case 4:
31388 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
31389 OpVT.getScalarSizeInBits() == 64)
31390 return SDValue();
31391 IntID = Intrinsic::aarch64_sve_zip_x4;
31392 break;
31393 }
31394
31395 SmallVector<SDValue, 5> Ops;
31396 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
31397 Ops.append(Op->op_values().begin(), Op->op_values().end());
31398 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
31399 }
31400
31401 if (Op->getNumOperands() != 2)
31402 return SDValue();
31403
31404 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
31405 Op.getOperand(1));
31406 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
31407 Op.getOperand(1));
31408 return DAG.getMergeValues({Lo, Hi}, DL);
31409}
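// Illustrative example: a two-way interleave of nxv8i16 operands A and B is
// the zip pair
//   lo = ZIP1(A, B)   // A0, B0, A1, B1, ...
//   hi = ZIP2(A, B)   // ..., A[n-1], B[n-1]
// while the three-way form stores the operands with st3 (which interleaves on
// the way out) and reloads the three contiguous results from the stack slot.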
31410
31411SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
31412 SelectionDAG &DAG) const {
31413 // FIXME: Maybe share some code with LowerMGather/Scatter?
31414 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
31415 SDLoc DL(HG);
31416 SDValue Chain = HG->getChain();
31417 SDValue Inc = HG->getInc();
31418 SDValue Mask = HG->getMask();
31419 SDValue Ptr = HG->getBasePtr();
31420 SDValue Index = HG->getIndex();
31421 SDValue Scale = HG->getScale();
31422 SDValue IntID = HG->getIntID();
31423
31424 // The Intrinsic ID determines the type of update operation.
31425 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
31426 // Right now, we only support 'add' as an update.
31427 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
31428 "Unexpected histogram update operation");
31429
31430 EVT IndexVT = Index.getValueType();
31431 LLVMContext &Ctx = *DAG.getContext();
31432 ElementCount EC = IndexVT.getVectorElementCount();
31433 EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
31434 EVT IncExtVT =
31435 EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
31436 EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
31437 bool ExtTrunc = IncSplatVT != MemVT;
31438
31439 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
31440 SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
31441 SDValue IncSplat = DAG.getSplatVector(
31442 IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
31443 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
31444
31445 MachineMemOperand *MMO = HG->getMemOperand();
31446 // Create an MMO for the gather, without load|store flags.
31447 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
31448 MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
31449 MMO->getAlign(), MMO->getAAInfo());
31450 ISD::MemIndexType IndexType = HG->getIndexType();
31451 SDValue Gather = DAG.getMaskedGather(
31452 DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
31453 ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
31454
31455 SDValue GChain = Gather.getValue(1);
31456
31457 // Perform the histcnt, multiply by inc, add to bucket data.
31458 SDValue ID =
31459 DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
31460 SDValue HistCnt =
31461 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
31462 SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
31463 SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
31464
31465 // Create an MMO for the scatter, without load|store flags.
31466 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
31467 MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
31468 MMO->getAlign(), MMO->getAAInfo());
31469
31470 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
31471 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
31472 ScatterOps, SMMO, IndexType, ExtTrunc);
31473 return Scatter;
31474}
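// Illustrative example: for @llvm.experimental.vector.histogram.add with
// indices I, increment Inc and mask M, the sequence built above is roughly
//   buckets = masked.gather(Ptr, I, M)      ; current bucket values
//   counts  = sve.histcnt(M, I, I)          ; occurrences of each index value
//   updated = buckets + counts * splat(Inc)
//   masked.scatter(updated, Ptr, I, M)
// so duplicate indices within one vector are handled by HISTCNT rather than by
// serialising the memory updates.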
31475
31476/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
31477/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
31478/// however still make use of the dot product instruction by instead
31479/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
31480/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
31481/// the following pattern is emitted:
31482/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0)), ext(EXTRACT_SUBVECTOR(N,
31483/// NTy/2))))
31484SDValue
31485AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
31486 SelectionDAG &DAG) const {
31487 SDLoc DL(Op);
31488
31489 SDValue Acc = Op.getOperand(0);
31490 SDValue LHS = Op.getOperand(1);
31491 SDValue RHS = Op.getOperand(2);
31492 EVT ResultVT = Op.getValueType();
31493 EVT OrigResultVT = ResultVT;
31494 EVT OpVT = LHS.getValueType();
31495
31496 bool ConvertToScalable =
31497 ResultVT.isFixedLengthVector() &&
31498 useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
31499
31500 // We can handle this case natively by accumulating into a wider
31501 // zero-padded vector.
31502 if (!ConvertToScalable && ResultVT == MVT::v2i32 && OpVT == MVT::v16i8) {
31503 SDValue ZeroVec = DAG.getConstant(0, DL, MVT::v4i32);
31504 SDValue WideAcc = DAG.getInsertSubvector(DL, ZeroVec, Acc, 0);
31505 SDValue Wide =
31506 DAG.getNode(Op.getOpcode(), DL, MVT::v4i32, WideAcc, LHS, RHS);
31507 SDValue Reduced = DAG.getNode(AArch64ISD::ADDP, DL, MVT::v4i32, Wide, Wide);
31508 return DAG.getExtractSubvector(DL, MVT::v2i32, Reduced, 0);
31509 }
31510
31511 if (ConvertToScalable) {
31512 ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
31513 OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
31514 Acc = convertToScalableVector(DAG, ResultVT, Acc);
31515 LHS = convertToScalableVector(DAG, OpVT, LHS);
31516 RHS = convertToScalableVector(DAG, OpVT, RHS);
31517 Op = DAG.getNode(Op.getOpcode(), DL, ResultVT, {Acc, LHS, RHS});
31518 }
31519
31520 // Two-way and four-way partial reductions are supported by patterns.
31521 // We only need to handle the 8-way partial reduction.
31522 if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
31523 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Op)
31524 : Op;
31525
31526 EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
31527 SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, DotVT,
31528 DAG.getConstant(0, DL, DotVT), LHS, RHS);
31529
31530 SDValue Res;
31531 bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
31532 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
31533 unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
31534 unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
31535 SDValue Lo = DAG.getNode(LoOpcode, DL, ResultVT, Acc, DotNode);
31536 Res = DAG.getNode(HiOpcode, DL, ResultVT, Lo, DotNode);
31537 } else {
31538 // Fold (nx)v4i32 into (nx)v2i64
31539 auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(DotNode, DL);
31540 if (IsUnsigned) {
31541 DotNodeLo = DAG.getZExtOrTrunc(DotNodeLo, DL, ResultVT);
31542 DotNodeHi = DAG.getZExtOrTrunc(DotNodeHi, DL, ResultVT);
31543 } else {
31544 DotNodeLo = DAG.getSExtOrTrunc(DotNodeLo, DL, ResultVT);
31545 DotNodeHi = DAG.getSExtOrTrunc(DotNodeHi, DL, ResultVT);
31546 }
31547 auto Lo = DAG.getNode(ISD::ADD, DL, ResultVT, Acc, DotNodeLo);
31548 Res = DAG.getNode(ISD::ADD, DL, ResultVT, Lo, DotNodeHi);
31549 }
31550
31551 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Res)
31552 : Res;
31553}
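// Illustrative example: an 8-way partial reduction of nxv16i8 inputs into an
// nxv2i64 accumulator is first expressed as a zero-accumulated dot product
// producing nxv4i32 (selected as udot/sdot by existing patterns), and the
// nxv4i32 result is then folded into the i64 accumulator either with
// UADDWB/UADDWT when SVE2 is available or, otherwise, by extending the two
// halves of the dot result and adding them in turn.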
31554
31555SDValue
31556AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
31557 SelectionDAG &DAG) const {
31558 EVT VT = Op.getValueType();
31559 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31560
31561 assert(Subtarget->isSVEorStreamingSVEAvailable() &&
31562 "Lowering fixed length get_active_lane_mask requires SVE!");
31563
31564 // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
31565 // but we can use SVE when available.
31566
31567 SDLoc DL(Op);
31568 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31569 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
31570
31571 SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WhileVT,
31572 Op.getOperand(0), Op.getOperand(1));
31573 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
31574 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
31575 DAG.getVectorIdxConstant(0, DL));
31576}
31577
31578SDValue
31579AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
31580 SelectionDAG &DAG) const {
31581 EVT VT = Op.getValueType();
31582 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31583
31584 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
31585 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
31586 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
31587
31588 SDLoc DL(Op);
31589 SDValue Val = Op.getOperand(0);
31590 EVT SrcVT = Val.getValueType();
31591 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
31592 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
31593
31594 if (VT.bitsGT(SrcVT)) {
31595 EVT CvtVT = ContainerDstVT.changeVectorElementType(
31596 ContainerSrcVT.getVectorElementType());
31597 SDValue Pg = getPredicateForVector(DAG, DL, VT);
31598
31599 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
31600 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
31601
31602 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
31603 Val = getSVESafeBitCast(CvtVT, Val, DAG);
31604 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
31605 DAG.getUNDEF(ContainerDstVT));
31606 return convertFromScalableVector(DAG, VT, Val);
31607 } else {
31608 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
31609 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
31610
31611 // Safe to use a larger than specified result since an fp_to_int where the
31612 // result doesn't fit into the destination is undefined.
31613 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
31614 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
31615 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
31616
31617 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
31618 }
31619}
31620
31621 static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
31622 ArrayRef<int> ShuffleMask, EVT VT,
31623 EVT ContainerVT, SelectionDAG &DAG) {
31624 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
31625 SDLoc DL(Op);
31626 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
31627 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
31628 bool IsSingleOp =
31629 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
31630
31631 if (!Subtarget.isNeonAvailable() && !MinSVESize)
31632 MinSVESize = 128;
31633
31634 // Bail out on two-operand shuffles if SVE2 is unavailable or not all of
31635 // the index values can be represented.
31636 if (!IsSingleOp && !Subtarget.hasSVE2())
31637 return SDValue();
31638
31639 EVT VTOp1 = Op.getOperand(0).getValueType();
31640 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
31641 unsigned IndexLen = MinSVESize / BitsPerElt;
31642 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
31643 uint64_t MaxOffset = maxUIntN(BitsPerElt);
31644 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
31645 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
31646 bool MinMaxEqual = (MinSVESize == MaxSVESize);
31647 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
31648 "Incorrectly legalised shuffle operation");
31649
31650 SmallVector<SDValue, 8> TBLMask;
31651 // If MinSVESize is not equal to MaxSVESize then we need to know which
31652 // TBL mask element needs adjustment.
31653 SmallVector<SDValue, 8> AddRuntimeVLMask;
31654
31655 // Bail out for 8-bit element types, because with a 2048-bit SVE register
31656 // size 8 bits is only sufficient to index into the first source vector.
31657 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
31658 return SDValue();
31659
31660 for (int Index : ShuffleMask) {
31661 // Handle poison index values.
31662 if (Index < 0)
31663 Index = 0;
31664 // If the mask refers to elements in the second operand, then we have to
31665 // offset the index by the number of elements in a vector. If this number
31666 // is not known at compile-time, we need to maintain a mask with 'VL' values
31667 // to add at runtime.
31668 if ((unsigned)Index >= ElementsPerVectorReg) {
31669 if (MinMaxEqual) {
31670 Index += IndexLen - ElementsPerVectorReg;
31671 } else {
31672 Index = Index - ElementsPerVectorReg;
31673 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
31674 }
31675 } else if (!MinMaxEqual)
31676 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
31677 // For 8-bit elements and a 1024-bit SVE register, where MaxOffset equals
31678 // 255, this might point to the last element in the second operand
31679 // of the shufflevector, so we reject this transform.
31680 if ((unsigned)Index >= MaxOffset)
31681 return SDValue();
31682 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
31683 }
31684
31685 // Pad the mask with an out-of-range index so that the extra lanes are
31686 // zeroed, rather than with index zero, which would instead duplicate the
31687 // first lane. Note that for i8 elements an out-of-range index can still be
31688 // a valid index for a 2048-bit vector register size.
31689 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
31690 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
31691 if (!MinMaxEqual)
31692 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
31693 }
31694
31695 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
31696 SDValue VecMask =
31697 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
31698 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
31699
31700 SDValue Shuffle;
31701 if (IsSingleOp)
31702 Shuffle = DAG.getNode(
31703 ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
31704 DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), Op1,
31705 SVEMask);
31706 else if (Subtarget.hasSVE2()) {
31707 if (!MinMaxEqual) {
31708 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
31709 SDValue VScale = (BitsPerElt == 64)
31710 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
31711 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
31712 SDValue VecMask =
31713 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
31714 SDValue MulByMask = DAG.getNode(
31715 ISD::MUL, DL, MaskType,
31716 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
31717 DAG.getBuildVector(MaskType, DL,
31718 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
31719 SDValue UpdatedVecMask =
31720 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
31721 SVEMask = convertToScalableVector(
31722 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
31723 }
31724 Shuffle = DAG.getNode(
31725 ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
31726 DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), Op1,
31727 Op2, SVEMask);
31728 }
31729 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
31730 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
31731}
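// Illustrative example: with a known 256-bit SVE register, a two-operand v8i32
// shuffle that matches none of the cheaper patterns is emitted as a single
// sve.tbl2(Op1, Op2, mask) call, with mask entries that refer to Op2 adjusted
// so they index past the whole first register. When only the minimum vector
// length is known, that adjustment is instead computed at runtime by scaling a
// 0/1 helper mask with VSCALE (the !MinMaxEqual path above).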
31732
31733SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
31734 SDValue Op, SelectionDAG &DAG) const {
31735 EVT VT = Op.getValueType();
31736 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31737
31738 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
31739 auto ShuffleMask = SVN->getMask();
31740
31741 SDLoc DL(Op);
31742 SDValue Op1 = Op.getOperand(0);
31743 SDValue Op2 = Op.getOperand(1);
31744
31745 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31746 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
31747 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
31748
31749 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
31750 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
31751 return MVT::i32;
31752 return ScalarTy;
31753 };
31754
31755 if (SVN->isSplat()) {
31756 unsigned Lane = std::max(0, SVN->getSplatIndex());
31757 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
31758 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
31759 DAG.getConstant(Lane, DL, MVT::i64));
31760 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
31761 return convertFromScalableVector(DAG, VT, Op);
31762 }
31763
31764 bool ReverseEXT = false;
31765 unsigned Imm;
31766 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
31767 Imm == VT.getVectorNumElements() - 1) {
31768 if (ReverseEXT)
31769 std::swap(Op1, Op2);
31770 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
31771 SDValue Scalar = DAG.getNode(
31772 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
31773 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
31774 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
31775 return convertFromScalableVector(DAG, VT, Op);
31776 }
31777
31778 unsigned EltSize = VT.getScalarSizeInBits();
31779 for (unsigned BlockSize : {64U, 32U, 16U}) {
31780 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), BlockSize)) {
31781 unsigned RevOp;
31782 if (EltSize == 8)
31783 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
31784 else if (EltSize == 16)
31785 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
31786 else
31787 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
31788 EVT BlockedVT =
31789 getPackedSVEVectorVT(MVT::getIntegerVT(BlockSize));
31790 SDValue Pg = getPredicateForVector(DAG, DL, BlockedVT);
31791 SDValue BlockedOp1 = DAG.getNode(ISD::BITCAST, DL, BlockedVT, Op1);
31792 SDValue BlockedRev = DAG.getNode(RevOp, DL, BlockedVT, Pg, BlockedOp1,
31793 DAG.getUNDEF(BlockedVT));
31794 SDValue Container =
31795 DAG.getNode(ISD::BITCAST, DL, ContainerVT, BlockedRev);
31796 return convertFromScalableVector(DAG, VT, Container);
31797 }
31798 }
31799
31800 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
31801 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
31802 SDValue Pg = getPredicateForVector(DAG, DL, VT);
31803 SDValue Revd = DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, ContainerVT,
31804 Pg, Op1, DAG.getUNDEF(ContainerVT));
31805 return convertFromScalableVector(DAG, VT, Revd);
31806 }
31807
31808 unsigned WhichResult;
31809 unsigned OperandOrder;
31810 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
31811 OperandOrder) &&
31812 WhichResult == 0) {
31813 SDValue ZIP = DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT,
31814 OperandOrder == 0 ? Op1 : Op2,
31815 OperandOrder == 0 ? Op2 : Op1);
31816 return convertFromScalableVector(DAG, VT, ZIP);
31817 }
31818
31819 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
31820 OperandOrder)) {
31821 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
31822 SDValue TRN =
31823 DAG.getNode(Opc, DL, ContainerVT, OperandOrder == 0 ? Op1 : Op2,
31824 OperandOrder == 0 ? Op2 : Op1);
31825 return convertFromScalableVector(DAG, VT, TRN);
31826 }
31827
31828 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
31829 return convertFromScalableVector(
31830 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
31831
31832 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
31833 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
31834 return convertFromScalableVector(
31835 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
31836 }
31837
31838 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
31839 // represents the same logical operation as performed by a ZIP instruction. In
31840 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
31841 // equivalent to an AArch64 instruction. There's the extra component of
31842 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
31843 // only operated on 64/128bit vector types that have a direct mapping to a
31844 // target register and so an exact mapping is implied.
31845 // However, when using SVE for fixed length vectors, most legal vector types
31846 // are actually sub-vectors of a larger SVE register. When mapping
31847 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
31848 // how the mask's indices translate. Specifically, when the mapping requires
31849 // an exact meaning for a specific vector index (e.g. Index X is the last
31850 // vector element in the register) then such mappings are often only safe when
31851 // the exact SVE register size is known. The main exception to this is when
31852 // indices are logically relative to the first element of either
31853 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
31854 // when converting from fixed-length to scalable vector types (i.e. the start
31855 // of a fixed length vector is always the start of a scalable vector).
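// Illustrative example: the v8i32 reverse mask <7, 6, 5, 4, 3, 2, 1, 0> only
// corresponds to ISD::VECTOR_REVERSE on the nxv4i32 container when the SVE
// register is exactly 256 bits; with a wider register the container holds
// extra lanes, so "element 7" is no longer the last element. Hence the
// mappings below require MinSVESize == MaxSVESize == VT.getSizeInBits().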
31856 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
31857 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
31858 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
31859 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
31860 Op2.isUndef()) {
31861 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
31862 return convertFromScalableVector(DAG, VT, Op);
31863 }
31864
31865 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
31866 OperandOrder) &&
31867 WhichResult != 0) {
31868 SDValue ZIP = DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT,
31869 OperandOrder == 0 ? Op1 : Op2,
31870 OperandOrder == 0 ? Op2 : Op1);
31871 return convertFromScalableVector(DAG, VT, ZIP);
31872 }
31873
31874 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
31875 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
31876 return convertFromScalableVector(
31877 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
31878 }
31879
31880 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
31881 return convertFromScalableVector(
31882 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
31883
31884 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
31885 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
31886 return convertFromScalableVector(
31887 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
31888 }
31889
31890 if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
31891 Subtarget->isSVEorStreamingSVEAvailable()) {
31893 "Unsupported SVE vector size");
31894
31895 unsigned Segments = VT.getFixedSizeInBits() / 128;
31896 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
31897 if (std::optional<unsigned> Lane =
31898 isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
31899 SDValue IID = DAG.getTargetConstant(Intrinsic::aarch64_sve_dup_laneq,
31900 DL, MVT::i64);
31901 return convertFromScalableVector(
31902 DAG, VT,
31903 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
31904 {IID, Op1,
31905 DAG.getConstant(*Lane, DL, MVT::i64,
31906 /*isTarget=*/true)}));
31907 }
31908 }
31909 }
31910
31911 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
31912 // This may allow the shuffle to be matched as something cheaper like ZIP1.
31913 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
31914 return WideOp;
31915
31916 // Avoid producing TBL instruction if we don't know SVE register minimal size,
31917 // unless NEON is not available and we can assume minimal SVE register size is
31918 // 128-bits.
31919 if (MinSVESize || !Subtarget->isNeonAvailable())
31920 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
31921 DAG);
31922
31923 return SDValue();
31924}
31925
31926SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
31927 SelectionDAG &DAG) const {
31928 SDLoc DL(Op);
31929 EVT InVT = Op.getValueType();
31930
31931 assert(VT.isScalableVector() && isTypeLegal(VT) &&
31932 InVT.isScalableVector() && isTypeLegal(InVT) &&
31933 "Only expect to cast between legal scalable vector types!");
31934 assert(VT.getVectorElementType() != MVT::i1 &&
31935 InVT.getVectorElementType() != MVT::i1 &&
31936 "For predicate bitcasts, use getSVEPredicateBitCast");
31937
31938 if (InVT == VT)
31939 return Op;
31940
31941 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
31942 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
31943
31944 // Safe bitcasting between unpacked vector types of different element counts
31945 // is currently unsupported because the following is missing the necessary
31946 // work to ensure the result's elements live where they're supposed to within
31947 // an SVE register.
31948 // 01234567
31949 // e.g. nxv2i32 = XX??XX??
31950 // nxv4f16 = X?X?X?X?
31951 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
31952 VT == PackedVT || InVT == PackedInVT) &&
31953 "Unexpected bitcast!");
31954
31955 // Pack input if required.
31956 if (InVT != PackedInVT)
31957 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
31958
31959 if (Subtarget->isLittleEndian() ||
31960 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
31961 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
31962 else {
31963 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
31964 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
31965
31966 // Simulate the effect of casting through memory.
31967 Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
31968 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
31969 Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
31970 Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
31971 if (PackedVTAsInt.getScalarSizeInBits() != 8)
31972 Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
31973 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
31974 }
31975
31976 // Unpack result if required.
31977 if (VT != PackedVT)
31978 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
31979
31980 return Op;
31981}
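// Illustrative example: on a big-endian target, casting nxv2f64 to nxv4i32
// cannot be a plain BITCAST because lane order and memory order differ, so the
// code above models a store/reload: byte-swap within the source element size,
// NVCAST to the destination layout, then byte-swap within the new element
// size.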
31982
31983 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
31984 SDValue N) const {
31985 return ::isAllActivePredicate(DAG, N);
31986}
31987
31988 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
31989 return ::getPromotedVTForPredicate(VT);
31990}
31991
31992bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
31993 SDValue Op, const APInt &OriginalDemandedBits,
31994 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
31995 unsigned Depth) const {
31996
31997 unsigned Opc = Op.getOpcode();
31998 switch (Opc) {
31999 case AArch64ISD::VSHL: {
32000 // Match (VSHL (VLSHR Val X) X)
32001 SDValue ShiftL = Op;
32002 SDValue ShiftR = Op->getOperand(0);
32003 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
32004 return false;
32005
32006 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
32007 return false;
32008
32009 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
32010 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
32011
32012 // Other cases can be handled as well, but this is not
32013 // implemented.
32014 if (ShiftRBits != ShiftLBits)
32015 return false;
32016
32017 unsigned ScalarSize = Op.getScalarValueSizeInBits();
32018 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
32019
32020 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
32021 APInt UnusedBits = ~OriginalDemandedBits;
32022
32023 if ((ZeroBits & UnusedBits) != ZeroBits)
32024 return false;
32025
32026 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
32027 // used - simplify to just Val.
32028 return TLO.CombineTo(Op, ShiftR->getOperand(0));
32029 }
32030 case AArch64ISD::BICi: {
32031 // Fold BICi if all destination bits already known to be zeroed
32032 SDValue Op0 = Op.getOperand(0);
32033 KnownBits KnownOp0 =
32034 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
32035 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
32036 APInt BitsToClear =
32037 (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
32038 .trunc(KnownOp0.getBitWidth());
32039 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
32040 if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
32041 return TLO.CombineTo(Op, Op0);
32042
32043 Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
32044 return false;
32045 }
32046 case ISD::INTRINSIC_WO_CHAIN: {
32047 std::optional<ElementCount> MaxCount = getMaxValueForSVECntIntrinsic(Op);
32048 if (!MaxCount)
32049 return false;
32050 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
32051 if (!MaxSVEVectorSizeInBits)
32052 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
32053 unsigned VscaleMax = MaxSVEVectorSizeInBits / 128;
32054 unsigned MaxValue = MaxCount->getKnownMinValue() * VscaleMax;
32055 // The SVE count intrinsics don't support the multiplier immediate so we
32056 // don't have to account for that here. The value returned may be slightly
32057 // over the true required bits, as this is based on the "ALL" pattern. The
32058 // other patterns are also exposed by these intrinsics, but they all
32059 // return a value that's strictly less than "ALL".
32060 unsigned RequiredBits = llvm::bit_width(MaxValue);
32061 unsigned BitWidth = Known.Zero.getBitWidth();
32062 if (RequiredBits < BitWidth)
32063 Known.Zero.setHighBits(BitWidth - RequiredBits);
32064 return false;
32065 }
32066 }
32067
32068 return TargetLowering::SimplifyDemandedBitsForTargetNode(
32069 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
32070}
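// Illustrative example: for (VSHL (VLSHR X, 8), 8) on v8i16 the shift pair only
// clears the low 8 bits of each lane, so if the caller never demands those
// bits the pair is replaced by X directly. Likewise a BICi whose cleared bits
// are already known to be zero collapses to its first operand.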
32071
32072bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
32073 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
32074 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
32075
32076 // TODO: Add more target nodes.
32077 switch (Op.getOpcode()) {
32078 case AArch64ISD::MOVI:
32079 case AArch64ISD::MOVIedit:
32080 case AArch64ISD::MOVImsl:
32081 case AArch64ISD::MOVIshift:
32082 case AArch64ISD::MVNImsl:
32083 case AArch64ISD::MVNIshift:
32084 case AArch64ISD::VASHR:
32085 case AArch64ISD::VLSHR:
32086 case AArch64ISD::VSHL:
32087 return false;
32088 }
32089 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
32090 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
32091}
32092
32093bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
32094 return Op.getOpcode() == AArch64ISD::DUP ||
32095 Op.getOpcode() == AArch64ISD::MOVI ||
32096 Op.getOpcode() == AArch64ISD::MOVIshift ||
32097 Op.getOpcode() == AArch64ISD::MOVImsl ||
32098 Op.getOpcode() == AArch64ISD::MOVIedit ||
32099 Op.getOpcode() == AArch64ISD::MVNIshift ||
32100 Op.getOpcode() == AArch64ISD::MVNImsl ||
32101 // Ignoring fneg(movi(0)), because if it is folded to FPConstant(-0.0),
32102 // ISel will select fmov(mov i64 0x8000000000000000), resulting in a
32103 // fmov from gpr to fpr, which is more expensive than fneg(movi(0))
32104 (Op.getOpcode() == ISD::FNEG &&
32105 Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit &&
32106 Op.getOperand(0).getConstantOperandVal(0) == 0) ||
32107 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
32108 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
32109 TargetLowering::isTargetCanonicalConstantNode(Op);
32110}
32111
32112 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
32113 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
32114 Subtarget->hasComplxNum();
32115}
32116
32117 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
32118 ComplexDeinterleavingOperation Operation, Type *Ty) const {
32119 auto *VTy = dyn_cast<VectorType>(Ty);
32120 if (!VTy)
32121 return false;
32122
32123 // If the vector is scalable, SVE is enabled, implying support for complex
32124 // numbers. Otherwise, we need to ensure complex number support is available
32125 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
32126 return false;
32127
32128 auto *ScalarTy = VTy->getScalarType();
32129 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
32130
32131 // We can only process vectors that have a bit size of 128 or higher (with an
32132 // additional 64 bits for Neon). Additionally, these vectors must have a
32133 // power-of-2 size, as we later split them into the smallest supported size
32134 // and merge them back together after applying the complex operation.
32135 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
32136 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
32137 !llvm::isPowerOf2_32(VTyWidth))
32138 return false;
32139
32140 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
32141 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
32142
32143 if (Operation == ComplexDeinterleavingOperation::CDot)
32144 return ScalarWidth == 32 || ScalarWidth == 64;
32145 return 8 <= ScalarWidth && ScalarWidth <= 64;
32146 }
32147
32148 // CDot is not supported outside of scalable/sve scopes
32149 if (Operation == ComplexDeinterleavingOperation::CDot)
32150 return false;
32151
32152 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
32153 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
32154}
32155
32156 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
32157 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
32158 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
32159 Value *Accumulator) const {
32160 VectorType *Ty = cast<VectorType>(InputA->getType());
32161 if (Accumulator == nullptr)
32162 Accumulator = Constant::getNullValue(Ty);
32163 bool IsScalable = Ty->isScalableTy();
32164 bool IsInt = Ty->getElementType()->isIntegerTy();
32165
32166 unsigned TyWidth =
32167 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
32168
32169 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
32170 "Vector type must be either 64 or a power of 2 that is at least 128");
32171
32172 if (TyWidth > 128) {
32173 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
32174 int AccStride = cast<VectorType>(Accumulator->getType())
32175 ->getElementCount()
32176 .getKnownMinValue() /
32177 2;
32178 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
32179 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
32180 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
32181 auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
32182 auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
32183 Value *LowerSplitAcc = nullptr;
32184 Value *UpperSplitAcc = nullptr;
32185 Type *FullTy = Ty;
32186 FullTy = Accumulator->getType();
32187 auto *HalfAccTy = VectorType::getHalfElementsVectorType(
32188 cast<VectorType>(Accumulator->getType()));
32189 LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
32190 UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
32191 auto *LowerSplitInt = createComplexDeinterleavingIR(
32192 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
32193 auto *UpperSplitInt = createComplexDeinterleavingIR(
32194 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
32195
32196 auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
32197 LowerSplitInt, uint64_t(0));
32198 return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
32199 }
32200
32201 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
32202 if (IsScalable) {
32203 if (IsInt)
32204 return B.CreateIntrinsic(
32205 Intrinsic::aarch64_sve_cmla_x, Ty,
32206 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
32207
32208 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
32209 return B.CreateIntrinsic(
32210 Intrinsic::aarch64_sve_fcmla, Ty,
32211 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
32212 }
32213
32214 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
32215 Intrinsic::aarch64_neon_vcmla_rot90,
32216 Intrinsic::aarch64_neon_vcmla_rot180,
32217 Intrinsic::aarch64_neon_vcmla_rot270};
32218
32219
32220 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
32221 {Accumulator, InputA, InputB});
32222 }
32223
32224 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
32225 if (IsScalable) {
32226 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
32227 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
32228 if (IsInt)
32229 return B.CreateIntrinsic(
32230 Intrinsic::aarch64_sve_cadd_x, Ty,
32231 {InputA, InputB, B.getInt32((int)Rotation * 90)});
32232
32233 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
32234 return B.CreateIntrinsic(
32235 Intrinsic::aarch64_sve_fcadd, Ty,
32236 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
32237 }
32238 return nullptr;
32239 }
32240
32241 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
32242 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
32243 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
32244 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
32245 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
32246
32247 if (IntId == Intrinsic::not_intrinsic)
32248 return nullptr;
32249
32250 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
32251 }
32252
32253 if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
32254 IsScalable) {
32255 return B.CreateIntrinsic(
32256 Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
32257 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
32258 }
32259
32260 return nullptr;
32261}
32262
32263bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
32264 unsigned Opc = N->getOpcode();
32265 if (ISD::isExtOpcode(Opc)) {
32266 if (any_of(N->users(),
32267 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
32268 return false;
32269 }
32270 return true;
32271}
32272
32273 unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
32274 return Subtarget->getMinimumJumpTableEntries();
32275}
32276
32277 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
32278 CallingConv::ID CC,
32279 EVT VT) const {
32280 bool NonUnitFixedLengthVector =
32281 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
32282 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
32283 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
32284
32285 EVT VT1;
32286 MVT RegisterVT;
32287 unsigned NumIntermediates;
32288 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
32289 RegisterVT);
32290 return RegisterVT;
32291}
32292
32293 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
32294 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
32295 bool NonUnitFixedLengthVector =
32296 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
32297 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
32298 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
32299
32300 EVT VT1;
32301 MVT VT2;
32302 unsigned NumIntermediates;
32303 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
32304 NumIntermediates, VT2);
32305}
32306
32307 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
32308 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
32309 unsigned &NumIntermediates, MVT &RegisterVT) const {
32310 unsigned NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
32311 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
32312 if (!RegisterVT.isFixedLengthVector() ||
32313 RegisterVT.getFixedSizeInBits() <= 128)
32314 return NumRegs;
32315
32316 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
32317 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
32318 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
32319
32320 // A size mismatch here implies either type promotion or widening and would
32321 // have resulted in scalarisation if larger vectors had not been available.
32322 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
32323 EVT EltTy = VT.getVectorElementType();
32324 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
32325 if (!isTypeLegal(NewVT))
32326 NewVT = EltTy;
32327
32328 IntermediateVT = NewVT;
32329 NumIntermediates = VT.getVectorNumElements();
32330 RegisterVT = getRegisterType(Context, NewVT);
32331 return NumIntermediates;
32332 }
32333
32334 // SVE VLS support does not introduce a new ABI so we should use NEON sized
32335 // types for vector arguments and returns.
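 // For example (illustrative, assuming 512-bit SVE registers with
 // useSVEForFixedLengthVectors()): a fixed-length v16i32 argument reaches this
 // point as a single 512-bit register type and is re-described below as four
 // NEON-sized v4i32 registers, so the observable calling convention matches
 // plain NEON.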
32336
32337 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
32338 NumIntermediates *= NumSubRegs;
32339 NumRegs *= NumSubRegs;
32340
32341 switch (RegisterVT.getVectorElementType().SimpleTy) {
32342 default:
32343 llvm_unreachable("unexpected element type for vector");
32344 case MVT::i8:
32345 IntermediateVT = RegisterVT = MVT::v16i8;
32346 break;
32347 case MVT::i16:
32348 IntermediateVT = RegisterVT = MVT::v8i16;
32349 break;
32350 case MVT::i32:
32351 IntermediateVT = RegisterVT = MVT::v4i32;
32352 break;
32353 case MVT::i64:
32354 IntermediateVT = RegisterVT = MVT::v2i64;
32355 break;
32356 case MVT::f16:
32357 IntermediateVT = RegisterVT = MVT::v8f16;
32358 break;
32359 case MVT::f32:
32360 IntermediateVT = RegisterVT = MVT::v4f32;
32361 break;
32362 case MVT::f64:
32363 IntermediateVT = RegisterVT = MVT::v2f64;
32364 break;
32365 case MVT::bf16:
32366 IntermediateVT = RegisterVT = MVT::v8bf16;
32367 break;
32368 }
32369
32370 return NumRegs;
32371}
32372
32373bool AArch64TargetLowering::hasInlineStackProbe(
32374 const MachineFunction &MF) const {
32375 return !Subtarget->isTargetWindows() &&
32376 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
32377}
32378
32380 switch (Opc) {
32384 if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
32385 return true;
32386 }
32387
32389}
32390
32391bool AArch64TargetLowering::shouldPreservePtrArith(const Function &F,
32392 EVT VT) const {
32393 return Subtarget->hasCPA() && UseFEATCPACodegen;
32394}
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
bool lowerInterleaveIntrinsicToStore(Instruction *Store, Value *Mask, ArrayRef< Value * > InterleaveValues) const override
Lower an interleave intrinsic to a target specific store intrinsic.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
bool useNewSMEABILowering() const
Returns true if the new SME ABI lowering should be used.
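A rough, hedged illustration of what a hook such as isLegalAddImmediate checks on AArch64 (a minimal sketch assuming the standard add/sub immediate encoding, not the in-tree implementation): ADD/SUB accept a 12-bit unsigned immediate, optionally shifted left by 12.
#include <cstdint>
// Minimal sketch; the helper name is illustrative.
static bool isLegalAddImmediateSketch(int64_t Imm) {
  uint64_t V = Imm < 0 ? 0 - static_cast<uint64_t>(Imm)
                       : static_cast<uint64_t>(Imm);
  return (V >> 12) == 0 ||                       // plain 12-bit immediate
         ((V & 0xfffULL) == 0 && (V >> 24) == 0); // 12-bit immediate, LSL #12
}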
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:290
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:230
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
static LLVM_ABI void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition APInt.cpp:1890
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1671
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
LLVM_ABI APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1928
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition APInt.h:1167
LLVM_ABI APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1935
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1640
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1762
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
bool isMask(unsigned numBits) const
Definition APInt.h:489
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:335
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1258
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1238
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
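A minimal sketch of how the APInt helpers listed above compose (illustrative values only, not code from this file):
#include "llvm/ADT/APInt.h"
using namespace llvm;

static void apintSketch() {
  APInt Lo = APInt::getLowBitsSet(32, 8);   // 0x000000FF
  APInt Hi = APInt::getHighBitsSet(32, 4);  // 0xF0000000
  bool IsMask = Lo.isMask(8);               // true: exactly the low 8 bits
  unsigned TZ = Hi.countr_zero();           // 28 trailing zero bits
  APInt Wide = Lo.zext(64);                 // zero-extend to 64 bits
  APInt Narrow = Wide.trunc(16);            // truncate back to 16 bits
  (void)IsMask; (void)TZ; (void)Narrow;
}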
an instruction to allocate memory on the stack
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
bool isFloatingPointOperation() const
BinOp getOperation() const
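A minimal sketch of classifying an atomicrmw by the BinOp values above, the kind of check an atomic-expansion hook performs (the helper name is illustrative):
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool isIntegerMinMaxRMW(const AtomicRMWInst &RMW) {
  if (RMW.isFloatingPointOperation())
    return false;
  switch (RMW.getOperation()) {
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    return true;
  default:
    return false;
  }
}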
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const BlockAddress * getBlockAddress() const
Function * getFunction() const
Definition Constants.h:940
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
LLVM_ABI std::optional< std::pair< APInt, APInt > > isConstantSequence() const
If this BuildVector is constant and represents the numerical series "<a, a+n, a+2n,...
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
LLVM_ABI bool isConstant() const
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
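A minimal sketch of the usual CCState/CCValAssign pattern (the helper name and parameters are assumptions, not taken from this file): run an assign function over the outgoing values, then walk the resulting locations.
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

static void analyzeReturnSketch(CallingConv::ID CC, bool IsVarArg,
                                MachineFunction &MF,
                                const SmallVectorImpl<ISD::OutputArg> &Outs,
                                CCAssignFn AssignFn, LLVMContext &Ctx) {
  SmallVector<CCValAssign, 16> Locs;
  CCState CCInfo(CC, IsVarArg, MF, Locs, Ctx);
  CCInfo.AnalyzeReturn(Outs, AssignFn);
  for (const CCValAssign &VA : Locs) {
    if (VA.isRegLoc())
      (void)VA.getLocReg();        // value assigned to a register
    else
      (void)VA.getLocMemOffset();  // value assigned to a stack slot
  }
}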
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:214
bool isBigEndian() const
Definition DataLayout.h:215
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
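A minimal sketch of the DataLayout queries listed above (the wrapper function is illustrative only):
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static void layoutSketch(const DataLayout &DL, Type *Ty) {
  TypeSize Size = DL.getTypeAllocSize(Ty); // size in bytes, including padding
  Align Pref = DL.getPrefTypeAlign(Ty);    // preferred alignment
  bool LE = DL.isLittleEndian();
  (void)Size; (void)Pref; (void)LE;
}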
A debug info location.
Definition DebugLoc.h:123
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
static FixedVectorType * getInteger(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
arg_iterator arg_end()
Definition Function.h:875
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
const Argument * const_arg_iterator
Definition Function.h:73
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:730
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
bool hasExternalWeakLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:132
Type * getValueType() const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition IRBuilder.h:1939
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2254
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2511
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:605
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:552
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
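A minimal sketch of the IRBuilder calls listed above; the callee and the byte offset are illustrative, not values used by this file.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *builderSketch(IRBuilderBase &IRB, FunctionCallee Callee,
                            Value *Base) {
  // Compute &Base[16] over i8, cast to a plain pointer, and call Callee on it.
  Value *GEP = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), Base, 16);
  Value *Ptr = IRB.CreatePointerCast(GEP, IRB.getPtrTy());
  return IRB.CreateCall(Callee, {Ptr});
}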
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
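A minimal sketch of common MVT queries (illustrative only):
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

static void mvtSketch() {
  MVT V = MVT::getVectorVT(MVT::i32, 4);       // v4i32
  bool Is128 = V.is128BitVector();             // true
  unsigned NumElts = V.getVectorNumElements(); // 4
  MVT Elt = V.getVectorElementType();          // i32
  MVT Half = V.getHalfNumVectorElementsVT();   // v2i32
  (void)Is128; (void)NumElts; (void)Elt; (void)Half;
}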
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
bool hasScalableStackID(int ObjectIdx) const
bool isImmutableObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to an immutable object.
int getStackProtectorIndex() const
Return the index for the stack protector object.
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
LLVM_ABI int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
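A minimal sketch of creating and querying frame objects with the MachineFrameInfo methods above (sizes, offsets, and alignments are illustrative):
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

static void frameSketch(MachineFunction &MF) {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int Slot = MFI.CreateStackObject(/*Size=*/16, Align(16), /*isSpillSlot=*/false);
  int Fixed = MFI.CreateFixedObject(/*Size=*/8, /*SPOffset=*/0, /*IsImmutable=*/true);
  MFI.setObjectAlignment(Slot, Align(32));
  int64_t Off = MFI.getObjectOffset(Fixed);
  (void)Off;
}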
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
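A minimal sketch of the BuildMI/MachineInstrBuilder chaining pattern used by custom-inserter routines; the opcode and registers are placeholders, not ones used by this file.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

static void emitSketch(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                       const DebugLoc &DL, const TargetInstrInfo *TII,
                       unsigned Opcode, Register Dst, Register Src) {
  BuildMI(MBB, MBBI, DL, TII->get(Opcode), Dst)
      .addReg(Src)
      .addImm(0);
}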
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition MapVector.h:56
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition Module.cpp:717
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
void dropFlags(unsigned Mask)
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAssert() const
Test if this node is an assert operation.
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
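A minimal sketch of the SDValue/SDNode queries above, in the shape of a typical DAG-combine guard (the helper name is illustrative):
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

static bool isSingleUseAddOfConstant(SDValue V) {
  if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
    return false;
  SDValue RHS = V.getOperand(1);
  return isa<ConstantSDNode>(RHS.getNode());
}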
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasAgnosticZAInterface() const
bool hasStreamingInterfaceOrBody() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasSharedZAInterface() const
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresEnablingZAAfterCall() const
bool requiresPreservingZT0() const
bool requiresDisablingZABeforeCall() const
bool requiresPreservingAllZAState() const
Class to represent scalable SIMD vectors.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC)
LLVM_ABI Align getReducedAlign(EVT VT, bool UseABI)
In most cases this function returns the ABI alignment for a given type, except for illegal vector typ...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getNeutralElement(unsigned Opcode, const SDLoc &DL, EVT VT, SDNodeFlags Flags)
Get the (commutative) neutral element for the given opcode, if it exists.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType)
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getTypeSize(const SDLoc &DL, EVT VT, TypeSize TS)
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, unsigned OpFlags)
Set CalledGlobal to be associated with Node.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getDeactivationSymbol(const GlobalValue *GV)
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
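A minimal sketch of building nodes with the SelectionDAG helpers above: compute (x + 1) and compare it against zero (illustrative, not code from this file):
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue buildCmpSketch(SelectionDAG &DAG, const SDLoc &DL, SDValue X,
                              EVT BoolVT) {
  EVT VT = X.getValueType();
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, One);
  return DAG.getSetCC(DL, BoolVT, Add, DAG.getConstant(0, DL, VT), ISD::SETEQ);
}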
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
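A minimal sketch of the static shuffle-mask predicates listed above (the masks are illustrative):
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void maskSketch() {
  int Rev[4] = {3, 2, 1, 0};
  bool IsRev = ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4); // true
  int DeInt[4] = {0, 2, 4, 6};
  unsigned Index;
  bool IsDeInt =
      ShuffleVectorInst::isDeInterleaveMaskOfFactor(DeInt, /*Factor=*/2, Index);
  (void)IsRev; (void)IsDeInt;
}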
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:472
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition StringRef.h:573
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition StringRef.h:261
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition StringRef.h:611
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition StringRef.h:686
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
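A minimal sketch of the StringRef queries above, parsing a register-like name such as "w23" (illustrative, not this file's parser):
#include "llvm/ADT/StringRef.h"
using namespace llvm;

static bool parseRegNameSketch(StringRef Name, unsigned &RegNo) {
  if (!Name.starts_with("w") && !Name.starts_with("x"))
    return false;
  // getAsInteger returns true on failure, so invert for "parsed OK".
  return !Name.drop_front(1).getAsInteger(10, RegNo);
}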
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
virtual unsigned getMinimumJumpTableEntries() const
Return lower limit for number of blocks in a jump table.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action)
Indicate how a PARTIAL_REDUCE_U/SMLA node with Acc type AccVT and Input type InputVT should be treate...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
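The TargetLoweringBase hooks above are normally invoked from a target's TargetLowering constructor. A condensed, hypothetical fragment, not this file's actual configuration; the register class, the specific legalization choices, and the Subtarget object are illustrative assumptions:
  // Inside a hypothetical AArch64-like TargetLowering constructor:
  addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);     // register class choice is illustrative
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);      // split into separate div and mul/sub
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);         // assume no single truncating FP store
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i32, Legal);
  setMaxAtomicSizeInBitsSupported(128);
  computeRegisterProperties(Subtarget.getRegisterInfo());  // after all register classes are added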
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool useLoadStackGuardNode(const Module &M) const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
unsigned getPointerSize(unsigned AS) const
Get the pointer size for this target.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
LLVM_ABI InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
This class represents a truncation of integer types.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt128Ty(LLVMContext &C)
Definition Type.cpp:298
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
@ HalfTyID
16-bit floating point type
Definition Type.h:56
@ FloatTyID
32-bit floating point type
Definition Type.h:58
@ BFloatTyID
16-bit floating point type (7-bit significand)
Definition Type.h:57
@ DoubleTyID
64-bit floating point type
Definition Type.h:59
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:295
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:285
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI Type * getBFloatTy(LLVMContext &C)
Definition Type.cpp:283
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:282
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:759
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void dump() const
Support for debugging, callable in GDB: V->dump()
Base class of all SIMD vector types.
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
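A minimal sketch combining the Type factories and VectorType::get listed above; the helper name makeV4I32 is illustrative:
  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  static VectorType *makeV4I32(LLVMContext &Ctx) {
    auto *VTy = VectorType::get(Type::getInt32Ty(Ctx), ElementCount::getFixed(4)); // <4 x i32>
    VectorType *Half = VectorType::getHalfElementsVectorType(VTy);                 // <2 x i32>
    (void)Half;
    return VTy;
  }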
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isValidCBCond(AArch64CC::CondCode Code)
True if a given condition code can be used in a fused compare-and-branch instruction,...
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint64_t decodeAdvSIMDModImmType10(uint8_t Imm)
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isSVECpyDupImm(int SizeInBits, int64_t Val, int32_t &Imm, int32_t &Shift)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
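Only values accepted by isLogicalImmediate may be passed to encodeLogicalImmediate. A hedged sketch, assuming the AArch64_AM helpers above are visible via the target's MCTargetDesc/AArch64AddressingModes.h header; the helper name tryEncodeLogicalImm is illustrative:
  #include "MCTargetDesc/AArch64AddressingModes.h"
  #include <cstdint>
  using namespace llvm;

  static bool tryEncodeLogicalImm(uint64_t Imm, unsigned RegSize, uint64_t &Enc) {
    if (!AArch64_AM::isLogicalImmediate(Imm, RegSize))
      return false;                                      // e.g. 0 and all-ones are not encodable
    Enc = AArch64_AM::encodeLogicalImmediate(Imm, RegSize);
    return true;
  }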
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNormalMaskedLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked load.
bool isNormalMaskedStore(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked store.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ LOOP_DEPENDENCE_RAW_MASK
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:898
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:712
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:662
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ TRUNCATE_SSAT_U
Definition ISDOpcodes.h:861
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:815
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor to...
Definition ISDOpcodes.h:628
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition ISDOpcodes.h:688
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:534
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:669
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:958
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition ISDOpcodes.h:100
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:887
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition ISDOpcodes.h:633
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:707
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition ISDOpcodes.h:654
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively places vector elements based on mask e....
Definition ISDOpcodes.h:696
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:909
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:933
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor ...
Definition ISDOpcodes.h:617
@ TRUNCATE_SSAT_S
TRUNCATE_[SU]SAT_[SU] - Truncate for saturated operand [SU] located in middle, prefix for SAT means i...
Definition ISDOpcodes.h:859
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:719
@ TRUNCATE_USAT_U
Definition ISDOpcodes.h:863
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
@ LOOP_DEPENDENCE_WAR_MASK
Set rounding mode.
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
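A small sketch of the two ISD::CondCode transforms above; for integer types, inverting SETLT yields SETGE, while swapping its operands yields SETGT:
  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  static void condCodeExample() {
    ISD::CondCode CC = ISD::SETLT;
    ISD::CondCode Inv = ISD::getSetCCInverse(CC, MVT::i64);        // !(X < Y)  ->  X >= Y
    ISD::CondCode Swapped = ISD::getSetCCSwappedOperands(CC);      //  (X < Y)  ->  Y >  X
    (void)Inv; (void)Swapped;
  }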
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LLVM_ABI NodeType getVecReduceBaseOpcode(unsigned VecReduceOpcode)
Get underlying scalar opcode for VECREDUCE opcode.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
bool match(Val *V, const Pattern &P)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
CastInst_match< OpTy, UIToFPInst > m_UIToFP(const OpTy &Op)
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
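The PatternMatch combinators above nest to describe IR shapes declaratively. A minimal sketch; the matched shape and the helper name matchWideningAdd are illustrative:
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Recognise trunc(add(zext(A), zext(B))), binding A and B on success.
  static bool matchWideningAdd(Value *V, Value *&A, Value *&B) {
    return match(V, m_Trunc(m_Add(m_ZExt(m_Value(A)), m_ZExt(m_Value(B)))));
  }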
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
const unsigned VectorBits
Definition SystemZ.h:155
initializer< Ty > init(const Ty &Val)
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition ObjCARCUtil.h:43
bool attachedCallOpBundleNeedsMarker(const CallBase *CB)
This function determines whether the clang_arc_attachedcall should be emitted with or without the mar...
Definition ObjCARCUtil.h:58
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
bool isPackedVectorType(EVT SomeVT)
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:207
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
LLVM_ABI void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:295
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
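peekThroughBitcasts composes naturally with the constant predicates in this list. A one-line sketch; the helper name isZeroThroughBitcasts is illustrative:
  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  static bool isZeroThroughBitcasts(SDValue V) {
    return isNullConstant(peekThroughBitcasts(V));       // look through bitcasts, then test for 0
  }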
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:350
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition APFloat.h:1545
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1593
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI void reportFatalInternalError(Error Err)
Report a fatal error that indicates a bug in LLVM.
Definition Error.cpp:177
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
unsigned M1(unsigned Val)
Definition VE.h:377
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
generic_gep_type_iterator<> gep_type_iterator
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:261
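Worked values for the constexpr bit-manipulation predicates listed in this index (isMask_64, isShiftedMask_64, isPowerOf2_64); they evaluate at compile time, so static_assert is enough to demonstrate them:
  #include "llvm/Support/MathExtras.h"
  using namespace llvm;

  static_assert(isMask_64(0x00ff), "a run of ones starting at bit 0");
  static_assert(isShiftedMask_64(0x0ff0), "a run of ones, possibly shifted up");
  static_assert(isPowerOf2_64(64), "exactly one bit set");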
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
Definition ModRef.h:68
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
TargetTransformInfo TTI
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
CombineLevel
Definition DAGCombine.h:15
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI VectorType * getDeinterleavedVectorType(IntrinsicInst *DI)
Given a deinterleaveN intrinsic, return the (narrow) vector type of each factor.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1973
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
gep_type_iterator gep_type_begin(const User *GEP)
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2132
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1909
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2120
static const MachineMemOperand::Flags MOStridedAccess
@ Enabled
Convert any .debug_str_offsets tables to DWARF64 if needed.
Definition DWP.h:27
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:198
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static const unsigned PerfectShuffleTable[6561+1]
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
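The AArch64 permute classifiers above all take the mask plus an element count and report which result (e.g. zip1 vs zip2) the mask corresponds to. A hedged sketch, assuming their declarations are already in scope; the helper name classifyPermute and the 8-lane element count are illustrative:
  // For example, {0, 8, 1, 9, 2, 10, 3, 11} with NumElts = 8 is a zip1 mask.
  static void classifyPermute(ArrayRef<int> M, unsigned NumElts) {
    unsigned WhichResult = 0, OperandOrder = 0;
    bool IsZip = isZIPMask(M, NumElts, WhichResult, OperandOrder);   // zip1 if WhichResult == 0
    bool IsUzp = isUZPMask(M, NumElts, WhichResult);                 // uzp1 / uzp2 analogously
    bool IsTrn = isTRNMask(M, NumElts, WhichResult, OperandOrder);
    (void)IsZip; (void)IsUzp; (void)IsTrn;
  }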
@ Enable
Enable colors.
Definition WithColor.h:47
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
Helper structure to be able to read SetCC information.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
uint64_t getScalarStoreSize() const
Definition ValueTypes.h:402
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition ValueTypes.h:444
bool isScalableVT() const
Return true if the type is a scalable type.
Definition ValueTypes.h:187
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:292
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
bool isZero() const
Returns true if the value is known to be zero.
Definition KnownBits.h:80
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
static LLVM_ABI KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:135
static LLVM_ABI KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
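For illustration, a minimal sketch combining the KnownBits operations above on fully known 8-bit constants; the values are arbitrary.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

static unsigned demoKnownBits() {
  // Model the constants 0x0F and 4 as fully known 8-bit values.
  llvm::KnownBits K = llvm::KnownBits::makeConstant(llvm::APInt(8, 0x0F));
  llvm::KnownBits ShAmt = llvm::KnownBits::makeConstant(llvm::APInt(8, 4));
  // shl/add propagate known bits: 0x0F << 4 gives 0xF0, 0x0F + 0xF0 gives 0xFF.
  llvm::KnownBits Shifted = llvm::KnownBits::shl(K, ShAmt);
  llvm::KnownBits Sum = llvm::KnownBits::add(K, Shifted);
  return Sum.countMaxActiveBits(); // 8: every bit of the sum may be set
}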
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
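A hedged sketch of building a MachinePointerInfo for a stack slot; the frame index and offset are assumed to come from the caller.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

static llvm::MachinePointerInfo demoStackPointerInfo(llvm::MachineFunction &MF, int FI) {
  // Describe the spill slot for frame index FI, then a location 8 bytes past it.
  llvm::MachinePointerInfo Base = llvm::MachinePointerInfo::getFixedStack(MF, FI);
  return Base.getWithOffset(8);
}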
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
A simple container for information about the supported runtime calls.
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
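For illustration, a hedged sketch of the usual setter-chaining pattern on CallLoweringInfo when lowering a libcall; every input (TLI, DAG, DL, Chain, RetTy, Callee, Args) is assumed to be supplied by the surrounding lowering code.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"
#include <utility>

static std::pair<llvm::SDValue, llvm::SDValue>
demoLowerLibcall(const llvm::TargetLowering &TLI, llvm::SelectionDAG &DAG,
                 const llvm::SDLoc &DL, llvm::SDValue Chain, llvm::Type *RetTy,
                 llvm::SDValue Callee, llvm::TargetLowering::ArgListTy Args) {
  llvm::TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(DL)
      .setChain(Chain)
      .setLibCallee(llvm::CallingConv::C, RetTy, Callee, std::move(Args));
  // Returns {call result, output chain}.
  return TLI.LowerCallTo(CLI);
}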
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64