LLVM 22.0.0git
AArch64TargetTransformInfo.cpp
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
44 cl::Hidden);
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
52 static cl::opt<unsigned>
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
54 cl::Hidden);
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
72 static cl::opt<unsigned>
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
80namespace {
81class TailFoldingOption {
82 // These bitfields will only ever be set to something non-zero in operator=,
83 // when setting the -sve-tail-folding option. This option should always be of
84 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
85 // InitialBits is one of (disabled|all|simple). EnableBits represents
86 // additional flags we're enabling, and DisableBits for those flags we're
87 // disabling. The default flag is tracked in the variable NeedsDefault, since
88 // at the time of setting the option we may not know what the default value
89 // for the CPU is.
90 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
91 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
92 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
93
94 // This value needs to be initialised to true in case the user does not
95 // explicitly set the -sve-tail-folding option.
96 bool NeedsDefault = true;
97
98 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
99
100 void setNeedsDefault(bool V) { NeedsDefault = V; }
101
102 void setEnableBit(TailFoldingOpts Bit) {
103 EnableBits |= Bit;
104 DisableBits &= ~Bit;
105 }
106
107 void setDisableBit(TailFoldingOpts Bit) {
108 EnableBits &= ~Bit;
109 DisableBits |= Bit;
110 }
111
112 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
113 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
114
115 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
116 "Initial bits should only include one of "
117 "(disabled|all|simple|default)");
118 Bits = NeedsDefault ? DefaultBits : InitialBits;
119 Bits |= EnableBits;
120 Bits &= ~DisableBits;
121
122 return Bits;
123 }
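// e.g. (sketch) with DefaultBits == Simple, "default+reductions" yields
// Simple | Reductions, while "all+noreverse" yields All & ~Reverse.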
124
125 void reportError(std::string Opt) {
126 errs() << "invalid argument '" << Opt
127 << "' to -sve-tail-folding=; the option should be of the form\n"
128 " (disabled|all|default|simple)[+(reductions|recurrences"
129 "|reverse|noreductions|norecurrences|noreverse)]\n";
130 report_fatal_error("Unrecognised tail-folding option");
131 }
132
133public:
134
135 void operator=(const std::string &Val) {
136 // If the user explicitly sets -sve-tail-folding= then treat as an error.
137 if (Val.empty()) {
138 reportError("");
139 return;
140 }
141
142 // Since the user is explicitly setting the option we don't automatically
143 // need the default unless they require it.
144 setNeedsDefault(false);
145
146 SmallVector<StringRef, 4> TailFoldTypes;
147 StringRef(Val).split(TailFoldTypes, '+', -1, false);
148
149 unsigned StartIdx = 1;
150 if (TailFoldTypes[0] == "disabled")
151 setInitialBits(TailFoldingOpts::Disabled);
152 else if (TailFoldTypes[0] == "all")
153 setInitialBits(TailFoldingOpts::All);
154 else if (TailFoldTypes[0] == "default")
155 setNeedsDefault(true);
156 else if (TailFoldTypes[0] == "simple")
157 setInitialBits(TailFoldingOpts::Simple);
158 else {
159 StartIdx = 0;
160 setInitialBits(TailFoldingOpts::Disabled);
161 }
162
163 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
164 if (TailFoldTypes[I] == "reductions")
165 setEnableBit(TailFoldingOpts::Reductions);
166 else if (TailFoldTypes[I] == "recurrences")
167 setEnableBit(TailFoldingOpts::Recurrences);
168 else if (TailFoldTypes[I] == "reverse")
169 setEnableBit(TailFoldingOpts::Reverse);
170 else if (TailFoldTypes[I] == "noreductions")
171 setDisableBit(TailFoldingOpts::Reductions);
172 else if (TailFoldTypes[I] == "norecurrences")
173 setDisableBit(TailFoldingOpts::Recurrences);
174 else if (TailFoldTypes[I] == "noreverse")
175 setDisableBit(TailFoldingOpts::Reverse);
176 else
177 reportError(Val);
178 }
179 }
180
181 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
182 return (getBits(DefaultBits) & Required) == Required;
183 }
184};
185} // namespace
186
187TailFoldingOption TailFoldingOptionLoc;
188
190 "sve-tail-folding",
191 cl::desc(
192 "Control the use of vectorisation using tail-folding for SVE where the"
193 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
194 "\ndisabled (Initial) No loop types will vectorize using "
195 "tail-folding"
196 "\ndefault (Initial) Uses the default tail-folding settings for "
197 "the target CPU"
198 "\nall (Initial) All legal loop types will vectorize using "
199 "tail-folding"
200 "\nsimple (Initial) Use tail-folding for simple loops (not "
201 "reductions or recurrences)"
202 "\nreductions Use tail-folding for loops containing reductions"
203 "\nnoreductions Inverse of above"
204 "\nrecurrences Use tail-folding for loops containing fixed order "
205 "recurrences"
206 "\nnorecurrences Inverse of above"
207 "\nreverse Use tail-folding for loops requiring reversed "
208 "predicates"
209 "\nnoreverse Inverse of above"),
210 cl::location(TailFoldingOptionLoc));
211
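// For example (sketch): from clang this is reached via
// `-mllvm -sve-tail-folding=all+noreductions`, which tail-folds every legal
// loop type except those containing reductions, while
// `-sve-tail-folding=disabled` turns tail-folding off entirely.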
212// Experimental option that will only be fully functional when the
213// code-generator is changed to use SVE instead of NEON for all fixed-width
214// operations.
216 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
217
218// Experimental option that will only be fully functional when the cost-model
219// and code-generator have been changed to avoid using scalable vector
220// instructions that are not legal in streaming SVE mode.
222 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
223
224static bool isSMEABIRoutineCall(const CallInst &CI,
225 const AArch64TargetLowering &TLI) {
226 const auto *F = CI.getCalledFunction();
227 return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine();
228}
229
230/// Returns true if the function has explicit operations that can only be
231/// lowered using incompatible instructions for the selected mode. This also
232/// returns true if the function F may use or modify ZA state.
233 static bool hasPossibleIncompatibleOps(const Function *F,
234 const AArch64TargetLowering &TLI) {
235 for (const BasicBlock &BB : *F) {
236 for (const Instruction &I : BB) {
237 // Be conservative for now and assume that any call to inline asm or to
238 // intrinsics could result in non-streaming ops (e.g. calls to
239 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
240 // all native LLVM instructions can be lowered to compatible instructions.
241 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
242 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
243 isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
244 return true;
245 }
246 }
247 return false;
248}
249
250 uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
251 StringRef AttributeStr =
252 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
253 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
254 SmallVector<StringRef, 8> Features;
255 FeatureStr.split(Features, ",");
256 return AArch64::getFMVPriority(Features);
257}
258
259 bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
260 return F.hasFnAttribute("fmv-features");
261}
262
263const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
264 AArch64::FeatureExecuteOnly,
265};
266
267 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
268 const Function *Callee) const {
269 SMECallAttrs CallAttrs(*Caller, *Callee);
270
271 // Never inline a function explicitly marked as being streaming,
272 // into a non-streaming function. Assume it was marked as streaming
273 // for a reason.
274 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
275 CallAttrs.callee().hasStreamingInterface())
276 return false;
277
278 // When inlining, we should consider the body of the function, not the
279 // interface.
280 if (CallAttrs.callee().hasStreamingBody()) {
281 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
282 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
283 }
284
285 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
286 return false;
287
288 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
289 CallAttrs.requiresPreservingZT0() ||
290 CallAttrs.requiresPreservingAllZAState()) {
291 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
292 return false;
293 }
294
295 const TargetMachine &TM = getTLI()->getTargetMachine();
296 const FeatureBitset &CallerBits =
297 TM.getSubtargetImpl(*Caller)->getFeatureBits();
298 const FeatureBitset &CalleeBits =
299 TM.getSubtargetImpl(*Callee)->getFeatureBits();
300 // Adjust the feature bitsets by inverting some of the bits. This is needed
301 // for target features that represent restrictions rather than capabilities,
302 // for example a "+execute-only" callee can be inlined into a caller without
303 // "+execute-only", but not vice versa.
304 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
305 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
306
307 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
308}
309
310 bool AArch64TTIImpl::areTypesABICompatible(
311 const Function *Caller, const Function *Callee,
312 const ArrayRef<Type *> &Types) const {
313 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
314 return false;
315
316 // We need to ensure that argument promotion does not attempt to promote
317 // pointers to fixed-length vector types larger than 128 bits like
318 // <8 x float> (and pointers to aggregate types which have such fixed-length
319 // vector type members) into the values of the pointees. Such vector types
320 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
321 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
322 // types can be safely treated as 128-bit NEON types and they cannot be
323 // distinguished in IR.
324 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
325 auto FVTy = dyn_cast<FixedVectorType>(Ty);
326 return FVTy &&
327 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
328 }))
329 return false;
330
331 return true;
332}
333
334 unsigned
335 AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
336 unsigned DefaultCallPenalty) const {
337 // This function calculates a penalty for executing Call in F.
338 //
339 // There are two ways this function can be called:
340 // (1) F:
341 // call from F -> G (the call here is Call)
342 //
343 // For (1), Call.getCaller() == F, so it will always return a high cost if
344 // a streaming-mode change is required (thus promoting the need to inline the
345 // function)
346 //
347 // (2) F:
348 // call from F -> G (the call here is not Call)
349 // G:
350 // call from G -> H (the call here is Call)
351 //
352 // For (2), if after inlining the body of G into F the call to H requires a
353 // streaming-mode change, and the call to G from F would also require a
354 // streaming-mode change, then there is benefit to do the streaming-mode
355 // change only once and avoid inlining of G into F.
356
357 SMEAttrs FAttrs(*F);
358 SMECallAttrs CallAttrs(Call, getTLI());
359
360 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
361 if (F == Call.getCaller()) // (1)
362 return CallPenaltyChangeSM * DefaultCallPenalty;
363 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
364 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
365 }
366
367 return DefaultCallPenalty;
368}
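// For example (sketch, with the default penalties of 5 and 10): in case (1)
// a call requiring a streaming-mode change is costed at
// 5 * DefaultCallPenalty, making it attractive to inline the callee and
// remove the mode change; in case (2), where inlining G into F would still
// leave a mode-changing call to H, the larger factor of 10 is applied so the
// inliner prefers to keep G out of line and pay for the mode change only once.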
369
376
377/// Calculate the cost of materializing a 64-bit value. This helper
378/// method might only calculate a fraction of a larger immediate. Therefore it
379/// is valid to return a cost of ZERO.
380 InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
381 // Check if the immediate can be encoded within an instruction.
382 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
383 return 0;
384
385 if (Val < 0)
386 Val = ~Val;
387
388 // Calculate how many moves we will need to materialize this constant.
389 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
390 AArch64_IMM::expandMOVImm(Val, 64, Insn);
391 return Insn.size();
392}
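// For example (sketch): materialising 0x1234 is a single MOVZ, so the cost
// is 1; 0x0001000200030004 needs a MOVZ plus three MOVKs, so the cost is 4;
// and 0xFF is a valid logical immediate, so the cost is 0 because it can be
// encoded directly in the using instruction.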
393
394/// Calculate the cost of materializing the given constant.
395 InstructionCost
396 AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
397 TTI::TargetCostKind CostKind) const {
398 assert(Ty->isIntegerTy());
399
400 unsigned BitSize = Ty->getPrimitiveSizeInBits();
401 if (BitSize == 0)
402 return ~0U;
403
404 // Sign-extend all constants to a multiple of 64-bit.
405 APInt ImmVal = Imm;
406 if (BitSize & 0x3f)
407 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
408
409 // Split the constant into 64-bit chunks and calculate the cost for each
410 // chunk.
411 InstructionCost Cost = 0;
412 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
413 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
414 int64_t Val = Tmp.getSExtValue();
415 Cost += getIntImmCost(Val);
416 }
417 // We need at least one instruction to materialize the constant.
418 return std::max<InstructionCost>(1, Cost);
419}
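// For example (sketch): a 128-bit constant is costed as two 64-bit chunks;
// if each chunk needs two MOVZ/MOVK instructions the total is 4, while an
// all-zero constant still reports the minimum cost of 1.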
420
421 InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
422 const APInt &Imm, Type *Ty,
423 TTI::TargetCostKind CostKind,
424 Instruction *Inst) const {
425 assert(Ty->isIntegerTy());
426
427 unsigned BitSize = Ty->getPrimitiveSizeInBits();
428 // There is no cost model for constants with a bit size of 0. Return TCC_Free
429 // here, so that constant hoisting will ignore this constant.
430 if (BitSize == 0)
431 return TTI::TCC_Free;
432
433 unsigned ImmIdx = ~0U;
434 switch (Opcode) {
435 default:
436 return TTI::TCC_Free;
437 case Instruction::GetElementPtr:
438 // Always hoist the base address of a GetElementPtr.
439 if (Idx == 0)
440 return 2 * TTI::TCC_Basic;
441 return TTI::TCC_Free;
442 case Instruction::Store:
443 ImmIdx = 0;
444 break;
445 case Instruction::Add:
446 case Instruction::Sub:
447 case Instruction::Mul:
448 case Instruction::UDiv:
449 case Instruction::SDiv:
450 case Instruction::URem:
451 case Instruction::SRem:
452 case Instruction::And:
453 case Instruction::Or:
454 case Instruction::Xor:
455 case Instruction::ICmp:
456 ImmIdx = 1;
457 break;
458 // Always return TCC_Free for the shift value of a shift instruction.
459 case Instruction::Shl:
460 case Instruction::LShr:
461 case Instruction::AShr:
462 if (Idx == 1)
463 return TTI::TCC_Free;
464 break;
465 case Instruction::Trunc:
466 case Instruction::ZExt:
467 case Instruction::SExt:
468 case Instruction::IntToPtr:
469 case Instruction::PtrToInt:
470 case Instruction::BitCast:
471 case Instruction::PHI:
472 case Instruction::Call:
473 case Instruction::Select:
474 case Instruction::Ret:
475 case Instruction::Load:
476 break;
477 }
478
479 if (Idx == ImmIdx) {
480 int NumConstants = (BitSize + 63) / 64;
481 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
482 return (Cost <= NumConstants * TTI::TCC_Basic)
483 ? static_cast<int>(TTI::TCC_Free)
484 : Cost;
485 }
486 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
487 }
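// For example (sketch): for `add i64 %x, 42` the immediate is at most a
// single MOVZ, so it is reported as TCC_Free and left alone; a 64-bit
// immediate needing four MOVZ/MOVK instructions exceeds
// NumConstants * TCC_Basic and is reported at its real cost, making it a
// candidate for constant hoisting.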
488
489 InstructionCost
490 AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
491 const APInt &Imm, Type *Ty,
492 TTI::TargetCostKind CostKind) const {
493 assert(Ty->isIntegerTy());
494
495 unsigned BitSize = Ty->getPrimitiveSizeInBits();
496 // There is no cost model for constants with a bit size of 0. Return TCC_Free
497 // here, so that constant hoisting will ignore this constant.
498 if (BitSize == 0)
499 return TTI::TCC_Free;
500
501 // Most (all?) AArch64 intrinsics do not support folding immediates into the
502 // selected instruction, so we compute the materialization cost for the
503 // immediate directly.
504 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
505 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
506
507 switch (IID) {
508 default:
509 return TTI::TCC_Free;
510 case Intrinsic::sadd_with_overflow:
511 case Intrinsic::uadd_with_overflow:
512 case Intrinsic::ssub_with_overflow:
513 case Intrinsic::usub_with_overflow:
514 case Intrinsic::smul_with_overflow:
515 case Intrinsic::umul_with_overflow:
516 if (Idx == 1) {
517 int NumConstants = (BitSize + 63) / 64;
518 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
519 return (Cost <= NumConstants * TTI::TCC_Basic)
520 ? static_cast<int>(TTI::TCC_Free)
521 : Cost;
522 }
523 break;
524 case Intrinsic::experimental_stackmap:
525 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
526 return TTI::TCC_Free;
527 break;
528 case Intrinsic::experimental_patchpoint_void:
529 case Intrinsic::experimental_patchpoint:
530 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
531 return TTI::TCC_Free;
532 break;
533 case Intrinsic::experimental_gc_statepoint:
534 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
535 return TTI::TCC_Free;
536 break;
537 }
538 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
539 }
540
541 TargetTransformInfo::PopcntSupportKind
542 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
543 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
544 if (TyWidth == 32 || TyWidth == 64)
545 return TTI::PSK_FastHardware;
546 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
547 return TTI::PSK_Software;
548}
549
550static bool isUnpackedVectorVT(EVT VecVT) {
551 return VecVT.isScalableVector() &&
552 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
553 }
554
555 static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
556 const IntrinsicCostAttributes &ICA) {
557 // We need to know at least the number of elements in the vector of buckets
558 // and the size of each element to update.
559 if (ICA.getArgTypes().size() < 2)
560 return InstructionCost::getInvalid();
561
562 // Only interested in costing for the hardware instruction from SVE2.
563 if (!ST->hasSVE2())
564 return InstructionCost::getInvalid();
565
566 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
567 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
568 unsigned TotalHistCnts = 1;
569
570 unsigned EltSize = EltTy->getScalarSizeInBits();
571 // Only allow (up to 64b) integers or pointers
572 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
573 return InstructionCost::getInvalid();
574
575 // FIXME: We should be able to generate histcnt for fixed-length vectors
576 // using ptrue with a specific VL.
577 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
578 unsigned EC = VTy->getElementCount().getKnownMinValue();
579 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
580 return InstructionCost::getInvalid();
581
582 // HistCnt only supports 32b and 64b element types
583 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
584
585 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
586 return InstructionCost(BaseHistCntCost);
587
588 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
589 TotalHistCnts = EC / NaturalVectorWidth;
590
591 return InstructionCost(BaseHistCntCost * TotalHistCnts);
592 }
593
594 return InstructionCost::getInvalid();
595 }
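// For example (sketch, with the default aarch64-base-histcnt-cost of 8):
// <vscale x 4 x ptr> buckets updating i32 elements map onto one HISTCNT per
// 128-bit block (cost 8), while <vscale x 8 x ptr> buckets need two HISTCNT
// operations (cost 16).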
596
597 InstructionCost
598 AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
599 TTI::TargetCostKind CostKind) const {
600 // The code-generator is currently not able to handle scalable vectors
601 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
602 // it. This change will be removed when code-generation for these types is
603 // sufficiently reliable.
604 auto *RetTy = ICA.getReturnType();
605 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
606 if (VTy->getElementCount() == ElementCount::getScalable(1))
607 return InstructionCost::getInvalid();
608
609 switch (ICA.getID()) {
610 case Intrinsic::experimental_vector_histogram_add: {
611 InstructionCost HistCost = getHistogramCost(ST, ICA);
612 // If the cost isn't valid, we may still be able to scalarize
613 if (HistCost.isValid())
614 return HistCost;
615 break;
616 }
617 case Intrinsic::umin:
618 case Intrinsic::umax:
619 case Intrinsic::smin:
620 case Intrinsic::smax: {
621 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
622 MVT::v8i16, MVT::v2i32, MVT::v4i32,
623 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
624 MVT::nxv2i64};
625 auto LT = getTypeLegalizationCost(RetTy);
626 // v2i64 types get converted to cmp+bif hence the cost of 2
627 if (LT.second == MVT::v2i64)
628 return LT.first * 2;
629 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
630 return LT.first;
631 break;
632 }
633 case Intrinsic::sadd_sat:
634 case Intrinsic::ssub_sat:
635 case Intrinsic::uadd_sat:
636 case Intrinsic::usub_sat: {
637 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
638 MVT::v8i16, MVT::v2i32, MVT::v4i32,
639 MVT::v2i64};
640 auto LT = getTypeLegalizationCost(RetTy);
641 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
642 // need to extend the type, as it uses shr(qadd(shl, shl)).
643 unsigned Instrs =
644 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
645 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
646 return LT.first * Instrs;
647
648 TypeSize TS = RetTy->getPrimitiveSizeInBits();
649 uint64_t VectorSize = TS.getKnownMinValue();
650
651 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
652 return LT.first * Instrs;
653
654 break;
655 }
656 case Intrinsic::abs: {
657 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
658 MVT::v8i16, MVT::v2i32, MVT::v4i32,
659 MVT::v2i64};
660 auto LT = getTypeLegalizationCost(RetTy);
661 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
662 return LT.first;
663 break;
664 }
665 case Intrinsic::bswap: {
666 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
667 MVT::v4i32, MVT::v2i64};
668 auto LT = getTypeLegalizationCost(RetTy);
669 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
670 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
671 return LT.first;
672 break;
673 }
674 case Intrinsic::fma:
675 case Intrinsic::fmuladd: {
676 // Cost an fma or fmuladd the same as an fmul instruction, since the costs
677 // are usually the same. TODO: Add fp16 and bf16 expansion costs.
678 Type *EltTy = RetTy->getScalarType();
679 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
680 (EltTy->isHalfTy() && ST->hasFullFP16()))
681 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
682 break;
683 }
684 case Intrinsic::stepvector: {
685 InstructionCost Cost = 1; // Cost of the `index' instruction
686 auto LT = getTypeLegalizationCost(RetTy);
687 // Legalisation of illegal vectors involves an `index' instruction plus
688 // (LT.first - 1) vector adds.
689 if (LT.first > 1) {
690 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
691 InstructionCost AddCost =
692 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
693 Cost += AddCost * (LT.first - 1);
694 }
695 return Cost;
696 }
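// For example (sketch): a <vscale x 8 x i64> stepvector legalises to four
// <vscale x 2 x i64> parts, so the cost is one `index' instruction plus
// three vector adds to offset the remaining parts.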
697 case Intrinsic::vector_extract:
698 case Intrinsic::vector_insert: {
699 // If both the vector and subvector types are legal types and the index
700 // is 0, then this should be a no-op or simple operation; return a
701 // relatively low cost.
702
703 // If arguments aren't actually supplied, then we cannot determine the
704 // value of the index. We also want to skip predicate types.
705 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
706 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
707 break;
708
709 LLVMContext &C = RetTy->getContext();
710 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
711 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
712 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
713 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
714 // Skip this if either the vector or subvector types are unpacked
715 // SVE types; they may get lowered to stack stores and loads.
716 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
717 break;
718
719 TargetLoweringBase::LegalizeKind SubVecLK =
720 getTLI()->getTypeConversion(C, SubVecVT);
721 TargetLoweringBase::LegalizeKind VecLK =
722 getTLI()->getTypeConversion(C, VecVT);
723 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
724 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
725 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
726 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
727 return TTI::TCC_Free;
728 break;
729 }
730 case Intrinsic::bitreverse: {
731 static const CostTblEntry BitreverseTbl[] = {
732 {Intrinsic::bitreverse, MVT::i32, 1},
733 {Intrinsic::bitreverse, MVT::i64, 1},
734 {Intrinsic::bitreverse, MVT::v8i8, 1},
735 {Intrinsic::bitreverse, MVT::v16i8, 1},
736 {Intrinsic::bitreverse, MVT::v4i16, 2},
737 {Intrinsic::bitreverse, MVT::v8i16, 2},
738 {Intrinsic::bitreverse, MVT::v2i32, 2},
739 {Intrinsic::bitreverse, MVT::v4i32, 2},
740 {Intrinsic::bitreverse, MVT::v1i64, 2},
741 {Intrinsic::bitreverse, MVT::v2i64, 2},
742 };
743 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
744 const auto *Entry =
745 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
746 if (Entry) {
747 // The cost model uses the legal type (i32) that i8 and i16 will be
748 // promoted to, plus 1 so that we match the actual lowering cost.
749 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
750 TLI->getValueType(DL, RetTy, true) == MVT::i16)
751 return LegalisationCost.first * Entry->Cost + 1;
752
753 return LegalisationCost.first * Entry->Cost;
754 }
755 break;
756 }
757 case Intrinsic::ctpop: {
758 if (!ST->hasNEON()) {
759 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
760 return getTypeLegalizationCost(RetTy).first * 12;
761 }
762 static const CostTblEntry CtpopCostTbl[] = {
763 {ISD::CTPOP, MVT::v2i64, 4},
764 {ISD::CTPOP, MVT::v4i32, 3},
765 {ISD::CTPOP, MVT::v8i16, 2},
766 {ISD::CTPOP, MVT::v16i8, 1},
767 {ISD::CTPOP, MVT::i64, 4},
768 {ISD::CTPOP, MVT::v2i32, 3},
769 {ISD::CTPOP, MVT::v4i16, 2},
770 {ISD::CTPOP, MVT::v8i8, 1},
771 {ISD::CTPOP, MVT::i32, 5},
772 };
773 auto LT = getTypeLegalizationCost(RetTy);
774 MVT MTy = LT.second;
775 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
776 // Extra cost of +1 when illegal vector types are legalized by promoting
777 // the integer type.
778 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
779 RetTy->getScalarSizeInBits()
780 ? 1
781 : 0;
782 return LT.first * Entry->Cost + ExtraCost;
783 }
784 break;
785 }
786 case Intrinsic::sadd_with_overflow:
787 case Intrinsic::uadd_with_overflow:
788 case Intrinsic::ssub_with_overflow:
789 case Intrinsic::usub_with_overflow:
790 case Intrinsic::smul_with_overflow:
791 case Intrinsic::umul_with_overflow: {
792 static const CostTblEntry WithOverflowCostTbl[] = {
793 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
794 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
795 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
796 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
797 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
798 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
799 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
800 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
801 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
802 {Intrinsic::usub_with_overflow, MVT::i8, 3},
803 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
804 {Intrinsic::usub_with_overflow, MVT::i16, 3},
805 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
806 {Intrinsic::usub_with_overflow, MVT::i32, 1},
807 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
808 {Intrinsic::usub_with_overflow, MVT::i64, 1},
809 {Intrinsic::smul_with_overflow, MVT::i8, 5},
810 {Intrinsic::umul_with_overflow, MVT::i8, 4},
811 {Intrinsic::smul_with_overflow, MVT::i16, 5},
812 {Intrinsic::umul_with_overflow, MVT::i16, 4},
813 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
814 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
815 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
816 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
817 };
818 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
819 if (MTy.isSimple())
820 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
821 MTy.getSimpleVT()))
822 return Entry->Cost;
823 break;
824 }
825 case Intrinsic::fptosi_sat:
826 case Intrinsic::fptoui_sat: {
827 if (ICA.getArgTypes().empty())
828 break;
829 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
830 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
831 EVT MTy = TLI->getValueType(DL, RetTy);
832 // Check for the legal types, which are where the size of the input and the
833 // output are the same, or we are using cvt f64->i32 or f32->i64.
834 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
835 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
836 LT.second == MVT::v2f64)) {
837 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
838 (LT.second == MVT::f64 && MTy == MVT::i32) ||
839 (LT.second == MVT::f32 && MTy == MVT::i64)))
840 return LT.first;
841 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
842 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
843 MTy.getScalarSizeInBits() == 64)
844 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
845 }
846 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
847 // f32.
848 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
849 return LT.first + getIntrinsicInstrCost(
850 {ICA.getID(),
851 RetTy,
852 {ICA.getArgTypes()[0]->getWithNewType(
853 Type::getFloatTy(RetTy->getContext()))}},
854 CostKind);
855 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
856 (LT.second == MVT::f16 && MTy == MVT::i64) ||
857 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
858 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
859 return LT.first;
860 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
861 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
862 MTy.getScalarSizeInBits() == 32)
863 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
864 // Extending vector types v8f16->v8i32. These currently scalarize but the
865 // codegen could be better.
866 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
867 MTy.getScalarSizeInBits() == 64)
868 return MTy.getVectorNumElements() * 3;
869
870 // If we can, use a legal convert followed by a min+max
871 if ((LT.second.getScalarType() == MVT::f32 ||
872 LT.second.getScalarType() == MVT::f64 ||
873 LT.second.getScalarType() == MVT::f16) &&
874 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
875 Type *LegalTy =
876 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
877 if (LT.second.isVector())
878 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
879 InstructionCost Cost = 1;
880 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
881 LegalTy, {LegalTy, LegalTy});
882 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
883 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
884 LegalTy, {LegalTy, LegalTy});
885 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
886 return LT.first * Cost +
887 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
888 : 1);
889 }
890 // Otherwise we need to follow the default expansion that clamps the value
891 // using a float min/max with a fcmp+sel for nan handling when signed.
892 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
893 RetTy = RetTy->getScalarType();
894 if (LT.second.isVector()) {
895 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
896 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
897 }
898 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
899 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
900 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
901 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
902 Cost +=
903 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
904 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
905 if (IsSigned) {
906 Type *CondTy = RetTy->getWithNewBitWidth(1);
907 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
908 CmpInst::FCMP_UNO, CostKind);
909 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
910 CmpInst::FCMP_UNO, CostKind);
911 }
912 return LT.first * Cost;
913 }
914 case Intrinsic::fshl:
915 case Intrinsic::fshr: {
916 if (ICA.getArgs().empty())
917 break;
918
919 // TODO: Add handling for fshl where third argument is not a constant.
920 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
921 if (!OpInfoZ.isConstant())
922 break;
923
924 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
925 if (OpInfoZ.isUniform()) {
926 static const CostTblEntry FshlTbl[] = {
927 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
928 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
929 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
930 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
931 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
932 // to avoid having to duplicate the costs.
933 const auto *Entry =
934 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
935 if (Entry)
936 return LegalisationCost.first * Entry->Cost;
937 }
938
939 auto TyL = getTypeLegalizationCost(RetTy);
940 if (!RetTy->isIntegerTy())
941 break;
942
943 // Estimate cost manually, as types like i8 and i16 will get promoted to
944 // i32 and CostTableLookup will ignore the extra conversion cost.
945 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
946 RetTy->getScalarSizeInBits() < 64) ||
947 (RetTy->getScalarSizeInBits() % 64 != 0);
948 unsigned ExtraCost = HigherCost ? 1 : 0;
949 if (RetTy->getScalarSizeInBits() == 32 ||
950 RetTy->getScalarSizeInBits() == 64)
951 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
952 // extr instruction.
953 else if (HigherCost)
954 ExtraCost = 1;
955 else
956 break;
957 return TyL.first + ExtraCost;
958 }
959 case Intrinsic::get_active_lane_mask: {
960 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
961 if (RetTy) {
962 EVT RetVT = getTLI()->getValueType(DL, RetTy);
963 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
964 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
965 !getTLI()->isTypeLegal(RetVT)) {
966 // We don't have enough context at this point to determine if the mask
967 // is going to be kept live after the block, which will force the vXi1
968 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
969 // For now, we just assume the vectorizer created this intrinsic and
970 // the result will be the input for a PHI. In this case the cost will
971 // be extremely high for fixed-width vectors.
972 // NOTE: getScalarizationOverhead returns a cost that's far too
973 // pessimistic for the actual generated codegen. In reality there are
974 // two instructions generated per lane.
975 return RetTy->getNumElements() * 2;
976 }
977 }
978 break;
979 }
980 case Intrinsic::experimental_vector_match: {
981 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
982 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
983 unsigned SearchSize = NeedleTy->getNumElements();
984 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
985 // Base cost for MATCH instructions. At least on the Neoverse V2 and
986 // Neoverse V3, these are cheap operations with the same latency as a
987 // vector ADD. In most cases, however, we also need to do an extra DUP.
988 // For fixed-length vectors we currently need an extra five to six
989 // instructions besides the MATCH.
991 if (isa<FixedVectorType>(RetTy))
992 Cost += 10;
993 return Cost;
994 }
995 break;
996 }
997 case Intrinsic::experimental_cttz_elts: {
998 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
999 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1000 // This will consist of a SVE brkb and a cntp instruction. These
1001 // typically have the same latency and half the throughput as a vector
1002 // add instruction.
1003 return 4;
1004 }
1005 break;
1006 }
1007 default:
1008 break;
1009 }
1011}
1012
1013 /// Remove redundant reinterpret casts (to/from svbool) in the presence of
1014 /// control flow.
1015 static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1016 IntrinsicInst &II) {
1017 SmallVector<Instruction *, 32> Worklist;
1018 auto RequiredType = II.getType();
1019
1020 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1021 assert(PN && "Expected Phi Node!");
1022
1023 // Don't create a new Phi unless we can remove the old one.
1024 if (!PN->hasOneUse())
1025 return std::nullopt;
1026
1027 for (Value *IncValPhi : PN->incoming_values()) {
1028 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1029 if (!Reinterpret ||
1030 Reinterpret->getIntrinsicID() !=
1031 Intrinsic::aarch64_sve_convert_to_svbool ||
1032 RequiredType != Reinterpret->getArgOperand(0)->getType())
1033 return std::nullopt;
1034 }
1035
1036 // Create the new Phi
1037 IC.Builder.SetInsertPoint(PN);
1038 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1039 Worklist.push_back(PN);
1040
1041 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1042 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1043 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1044 Worklist.push_back(Reinterpret);
1045 }
1046
1047 // Cleanup Phi Node and reinterprets
1048 return IC.replaceInstUsesWith(II, NPN);
1049}
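// For example (a sketch with illustrative types): given
//   %phi = phi <vscale x 16 x i1> [ %a.sv, %bb0 ], [ %b.sv, %bb1 ]
//   %r = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(<vscale x 16 x i1> %phi)
// where %a.sv and %b.sv are convert.to.svbool casts of <vscale x 4 x i1>
// values, a new phi is created directly over those <vscale x 4 x i1> values
// and %r is replaced by it, leaving both conversions dead.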
1050
1051// A collection of properties common to SVE intrinsics that allow for combines
1052// to be written without needing to know the specific intrinsic.
1053 struct SVEIntrinsicInfo {
1054 //
1055 // Helper routines for common intrinsic definitions.
1056 //
1057
1058 // e.g. llvm.aarch64.sve.add pg, op1, op2
1059 // with IID ==> llvm.aarch64.sve.add_u
1060 static SVEIntrinsicInfo
1061 defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
1062 return SVEIntrinsicInfo()
1063 .setGoverningPredicateOperandIdx(0)
1064 .setOperandIdxInactiveLanesTakenFrom(1)
1065 .setMatchingUndefIntrinsic(IID);
1066 }
1067
1068 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1075
1076 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1082
1083 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1089
1090 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1091 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1092 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1093 return SVEIntrinsicInfo()
1094 .setGoverningPredicateOperandIdx(GPIndex)
1095 .setInactiveLanesAreUnused();
1096 }
1097
1098 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1099 // llvm.aarch64.sve.ld1 pg, ptr
1106
1107 // All properties relate to predication and thus having a general predicate
1108 // is the minimum requirement to say there is intrinsic info to act on.
1109 explicit operator bool() const { return hasGoverningPredicate(); }
1110
1111 //
1112 // Properties relating to the governing predicate.
1113 //
1114
1115 bool hasGoverningPredicate() const {
1116 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1117 }
1118
1119 unsigned getGoverningPredicateOperandIdx() const {
1120 assert(hasGoverningPredicate() && "Property not set!");
1121 return GoverningPredicateIdx;
1122 }
1123
1124 SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
1125 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1126 GoverningPredicateIdx = Index;
1127 return *this;
1128 }
1129
1130 //
1131 // Properties relating to operations the intrinsic could be transformed into.
1132 // NOTE: This does not mean such a transformation is always possible, but the
1133 // knowledge makes it possible to reuse existing optimisations without needing
1134 // to embed specific handling for each intrinsic. For example, instruction
1135 // simplification can be used to optimise an intrinsic's active lanes.
1136 //
1137
1138 bool hasMatchingUndefIntrinsic() const {
1139 return UndefIntrinsic != Intrinsic::not_intrinsic;
1140 }
1141
1142 Intrinsic::ID getMatchingUndefIntrinsic() const {
1143 assert(hasMatchingUndefIntrinsic() && "Property not set!");
1144 return UndefIntrinsic;
1145 }
1146
1147 SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
1148 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1149 UndefIntrinsic = IID;
1150 return *this;
1151 }
1152
1153 bool hasMatchingIROpode() const { return IROpcode != 0; }
1154
1155 unsigned getMatchingIROpode() const {
1156 assert(hasMatchingIROpode() && "Property not set!");
1157 return IROpcode;
1158 }
1159
1160 SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
1161 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1162 IROpcode = Opcode;
1163 return *this;
1164 }
1165
1166 //
1167 // Properties relating to the result of inactive lanes.
1168 //
1169
1170 bool inactiveLanesTakenFromOperand() const {
1171 return ResultLanes == InactiveLanesTakenFromOperand;
1172 }
1173
1174 unsigned getOperandIdxInactiveLanesTakenFrom() const {
1175 assert(inactiveLanesTakenFromOperand() && "Property not set!");
1176 return OperandIdxForInactiveLanes;
1177 }
1178
1179 SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
1180 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1181 ResultLanes = InactiveLanesTakenFromOperand;
1182 OperandIdxForInactiveLanes = Index;
1183 return *this;
1184 }
1185
1186 bool inactiveLanesAreNotDefined() const {
1187 return ResultLanes == InactiveLanesAreNotDefined;
1188 }
1189
1190 SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
1191 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1192 ResultLanes = InactiveLanesAreNotDefined;
1193 return *this;
1194 }
1195
1196 bool inactiveLanesAreUnused() const {
1197 return ResultLanes == InactiveLanesAreUnused;
1198 }
1199
1200 SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1201 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1202 ResultLanes = InactiveLanesAreUnused;
1203 return *this;
1204 }
1205
1206 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1207 // inactiveLanesAreZeroed =
1208 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1209 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1210
1211 SVEIntrinsicInfo &setResultIsZeroInitialized() {
1212 ResultIsZeroInitialized = true;
1213 return *this;
1214 }
1215
1216 //
1217 // The first operand of unary merging operations is typically only used to
1218 // set the result for inactive lanes. Knowing this allows us to deadcode the
1219 // operand when we can prove there are no inactive lanes.
1220 //
1221
1222 bool hasOperandWithNoActiveLanes() const {
1223 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1224 }
1225
1226 unsigned getOperandIdxWithNoActiveLanes() const {
1227 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1228 return OperandIdxWithNoActiveLanes;
1229 }
1230
1231 SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1232 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1233 OperandIdxWithNoActiveLanes = Index;
1234 return *this;
1235 }
1236
1237private:
1238 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1239
1240 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1241 unsigned IROpcode = 0;
1242
1243 enum PredicationStyle {
1244 Uninitialized,
1245 InactiveLanesTakenFromOperand,
1246 InactiveLanesAreNotDefined,
1247 InactiveLanesAreUnused
1248 } ResultLanes = Uninitialized;
1249
1250 bool ResultIsZeroInitialized = false;
1251 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1252 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1253};
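// For example (sketch): llvm.aarch64.sve.add is described below as
// defaultMergingOp(Intrinsic::aarch64_sve_add_u) with a matching IR opcode
// of Add, i.e. operand 0 is the governing predicate, inactive lanes are
// taken from operand 1, and active lanes behave like an integer add.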
1254
1255 static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1256 // Some SVE intrinsics do not use scalable vector types, but since they are
1257 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1258 if (!isa<ScalableVectorType>(II.getType()) &&
1259 all_of(II.args(), [&](const Value *V) {
1260 return !isa<ScalableVectorType>(V->getType());
1261 }))
1262 return SVEIntrinsicInfo();
1263
1264 Intrinsic::ID IID = II.getIntrinsicID();
1265 switch (IID) {
1266 default:
1267 break;
1268 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1269 case Intrinsic::aarch64_sve_fcvt_f16f32:
1270 case Intrinsic::aarch64_sve_fcvt_f16f64:
1271 case Intrinsic::aarch64_sve_fcvt_f32f16:
1272 case Intrinsic::aarch64_sve_fcvt_f32f64:
1273 case Intrinsic::aarch64_sve_fcvt_f64f16:
1274 case Intrinsic::aarch64_sve_fcvt_f64f32:
1275 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1276 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1277 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1278 case Intrinsic::aarch64_sve_fcvtzs:
1279 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1280 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1281 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1282 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1283 case Intrinsic::aarch64_sve_fcvtzu:
1284 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1285 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1286 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1287 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1288 case Intrinsic::aarch64_sve_scvtf:
1289 case Intrinsic::aarch64_sve_scvtf_f16i32:
1290 case Intrinsic::aarch64_sve_scvtf_f16i64:
1291 case Intrinsic::aarch64_sve_scvtf_f32i64:
1292 case Intrinsic::aarch64_sve_scvtf_f64i32:
1293 case Intrinsic::aarch64_sve_ucvtf:
1294 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1295 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1296 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1297 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1299
1300 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1301 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1302 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1303 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1305
1306 case Intrinsic::aarch64_sve_fabd:
1307 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1308 case Intrinsic::aarch64_sve_fadd:
1309 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1310 .setMatchingIROpcode(Instruction::FAdd);
1311 case Intrinsic::aarch64_sve_fdiv:
1312 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1313 .setMatchingIROpcode(Instruction::FDiv);
1314 case Intrinsic::aarch64_sve_fmax:
1315 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1316 case Intrinsic::aarch64_sve_fmaxnm:
1317 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1318 case Intrinsic::aarch64_sve_fmin:
1319 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1320 case Intrinsic::aarch64_sve_fminnm:
1321 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1322 case Intrinsic::aarch64_sve_fmla:
1323 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1324 case Intrinsic::aarch64_sve_fmls:
1325 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1326 case Intrinsic::aarch64_sve_fmul:
1327 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1328 .setMatchingIROpcode(Instruction::FMul);
1329 case Intrinsic::aarch64_sve_fmulx:
1330 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1331 case Intrinsic::aarch64_sve_fnmla:
1332 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1333 case Intrinsic::aarch64_sve_fnmls:
1334 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1335 case Intrinsic::aarch64_sve_fsub:
1336 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1337 .setMatchingIROpcode(Instruction::FSub);
1338 case Intrinsic::aarch64_sve_add:
1339 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1340 .setMatchingIROpcode(Instruction::Add);
1341 case Intrinsic::aarch64_sve_mla:
1342 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1343 case Intrinsic::aarch64_sve_mls:
1344 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1345 case Intrinsic::aarch64_sve_mul:
1346 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1347 .setMatchingIROpcode(Instruction::Mul);
1348 case Intrinsic::aarch64_sve_sabd:
1349 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1350 case Intrinsic::aarch64_sve_sdiv:
1351 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1352 .setMatchingIROpcode(Instruction::SDiv);
1353 case Intrinsic::aarch64_sve_smax:
1354 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1355 case Intrinsic::aarch64_sve_smin:
1356 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1357 case Intrinsic::aarch64_sve_smulh:
1358 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1359 case Intrinsic::aarch64_sve_sub:
1360 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1361 .setMatchingIROpcode(Instruction::Sub);
1362 case Intrinsic::aarch64_sve_uabd:
1363 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1364 case Intrinsic::aarch64_sve_udiv:
1365 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1366 .setMatchingIROpcode(Instruction::UDiv);
1367 case Intrinsic::aarch64_sve_umax:
1368 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1369 case Intrinsic::aarch64_sve_umin:
1370 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1371 case Intrinsic::aarch64_sve_umulh:
1372 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1373 case Intrinsic::aarch64_sve_asr:
1374 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1375 .setMatchingIROpcode(Instruction::AShr);
1376 case Intrinsic::aarch64_sve_lsl:
1377 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1378 .setMatchingIROpcode(Instruction::Shl);
1379 case Intrinsic::aarch64_sve_lsr:
1380 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1381 .setMatchingIROpcode(Instruction::LShr);
1382 case Intrinsic::aarch64_sve_and:
1383 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1384 .setMatchingIROpcode(Instruction::And);
1385 case Intrinsic::aarch64_sve_bic:
1386 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1387 case Intrinsic::aarch64_sve_eor:
1388 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1389 .setMatchingIROpcode(Instruction::Xor);
1390 case Intrinsic::aarch64_sve_orr:
1391 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1392 .setMatchingIROpcode(Instruction::Or);
1393 case Intrinsic::aarch64_sve_sqsub:
1394 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1395 case Intrinsic::aarch64_sve_uqsub:
1396 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1397
1398 case Intrinsic::aarch64_sve_add_u:
1400 Instruction::Add);
1401 case Intrinsic::aarch64_sve_and_u:
1403 Instruction::And);
1404 case Intrinsic::aarch64_sve_asr_u:
1406 Instruction::AShr);
1407 case Intrinsic::aarch64_sve_eor_u:
1409 Instruction::Xor);
1410 case Intrinsic::aarch64_sve_fadd_u:
1412 Instruction::FAdd);
1413 case Intrinsic::aarch64_sve_fdiv_u:
1415 Instruction::FDiv);
1416 case Intrinsic::aarch64_sve_fmul_u:
1418 Instruction::FMul);
1419 case Intrinsic::aarch64_sve_fsub_u:
1421 Instruction::FSub);
1422 case Intrinsic::aarch64_sve_lsl_u:
1424 Instruction::Shl);
1425 case Intrinsic::aarch64_sve_lsr_u:
1427 Instruction::LShr);
1428 case Intrinsic::aarch64_sve_mul_u:
1430 Instruction::Mul);
1431 case Intrinsic::aarch64_sve_orr_u:
1433 Instruction::Or);
1434 case Intrinsic::aarch64_sve_sdiv_u:
1436 Instruction::SDiv);
1437 case Intrinsic::aarch64_sve_sub_u:
1439 Instruction::Sub);
1440 case Intrinsic::aarch64_sve_udiv_u:
1442 Instruction::UDiv);
1443
1444 case Intrinsic::aarch64_sve_addqv:
1445 case Intrinsic::aarch64_sve_and_z:
1446 case Intrinsic::aarch64_sve_bic_z:
1447 case Intrinsic::aarch64_sve_brka_z:
1448 case Intrinsic::aarch64_sve_brkb_z:
1449 case Intrinsic::aarch64_sve_brkn_z:
1450 case Intrinsic::aarch64_sve_brkpa_z:
1451 case Intrinsic::aarch64_sve_brkpb_z:
1452 case Intrinsic::aarch64_sve_cntp:
1453 case Intrinsic::aarch64_sve_compact:
1454 case Intrinsic::aarch64_sve_eor_z:
1455 case Intrinsic::aarch64_sve_eorv:
1456 case Intrinsic::aarch64_sve_eorqv:
1457 case Intrinsic::aarch64_sve_nand_z:
1458 case Intrinsic::aarch64_sve_nor_z:
1459 case Intrinsic::aarch64_sve_orn_z:
1460 case Intrinsic::aarch64_sve_orr_z:
1461 case Intrinsic::aarch64_sve_orv:
1462 case Intrinsic::aarch64_sve_orqv:
1463 case Intrinsic::aarch64_sve_pnext:
1464 case Intrinsic::aarch64_sve_rdffr_z:
1465 case Intrinsic::aarch64_sve_saddv:
1466 case Intrinsic::aarch64_sve_uaddv:
1467 case Intrinsic::aarch64_sve_umaxv:
1468 case Intrinsic::aarch64_sve_umaxqv:
1469 case Intrinsic::aarch64_sve_cmpeq:
1470 case Intrinsic::aarch64_sve_cmpeq_wide:
1471 case Intrinsic::aarch64_sve_cmpge:
1472 case Intrinsic::aarch64_sve_cmpge_wide:
1473 case Intrinsic::aarch64_sve_cmpgt:
1474 case Intrinsic::aarch64_sve_cmpgt_wide:
1475 case Intrinsic::aarch64_sve_cmphi:
1476 case Intrinsic::aarch64_sve_cmphi_wide:
1477 case Intrinsic::aarch64_sve_cmphs:
1478 case Intrinsic::aarch64_sve_cmphs_wide:
1479 case Intrinsic::aarch64_sve_cmple_wide:
1480 case Intrinsic::aarch64_sve_cmplo_wide:
1481 case Intrinsic::aarch64_sve_cmpls_wide:
1482 case Intrinsic::aarch64_sve_cmplt_wide:
1483 case Intrinsic::aarch64_sve_cmpne:
1484 case Intrinsic::aarch64_sve_cmpne_wide:
1485 case Intrinsic::aarch64_sve_facge:
1486 case Intrinsic::aarch64_sve_facgt:
1487 case Intrinsic::aarch64_sve_fcmpeq:
1488 case Intrinsic::aarch64_sve_fcmpge:
1489 case Intrinsic::aarch64_sve_fcmpgt:
1490 case Intrinsic::aarch64_sve_fcmpne:
1491 case Intrinsic::aarch64_sve_fcmpuo:
1492 case Intrinsic::aarch64_sve_ld1:
1493 case Intrinsic::aarch64_sve_ld1_gather:
1494 case Intrinsic::aarch64_sve_ld1_gather_index:
1495 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1496 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1497 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1498 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1499 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1500 case Intrinsic::aarch64_sve_ld1q_gather_index:
1501 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1502 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1503 case Intrinsic::aarch64_sve_ld1ro:
1504 case Intrinsic::aarch64_sve_ld1rq:
1505 case Intrinsic::aarch64_sve_ld1udq:
1506 case Intrinsic::aarch64_sve_ld1uwq:
1507 case Intrinsic::aarch64_sve_ld2_sret:
1508 case Intrinsic::aarch64_sve_ld2q_sret:
1509 case Intrinsic::aarch64_sve_ld3_sret:
1510 case Intrinsic::aarch64_sve_ld3q_sret:
1511 case Intrinsic::aarch64_sve_ld4_sret:
1512 case Intrinsic::aarch64_sve_ld4q_sret:
1513 case Intrinsic::aarch64_sve_ldff1:
1514 case Intrinsic::aarch64_sve_ldff1_gather:
1515 case Intrinsic::aarch64_sve_ldff1_gather_index:
1516 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1517 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1518 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1519 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1520 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1521 case Intrinsic::aarch64_sve_ldnf1:
1522 case Intrinsic::aarch64_sve_ldnt1:
1523 case Intrinsic::aarch64_sve_ldnt1_gather:
1524 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1525 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1526 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1528
1529 case Intrinsic::aarch64_sve_prf:
1530 case Intrinsic::aarch64_sve_prfb_gather_index:
1531 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1532 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1533 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1534 case Intrinsic::aarch64_sve_prfd_gather_index:
1535 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1536 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1537 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1538 case Intrinsic::aarch64_sve_prfh_gather_index:
1539 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1540 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1541 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1542 case Intrinsic::aarch64_sve_prfw_gather_index:
1543 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1544 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1545 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1547
1548 case Intrinsic::aarch64_sve_st1_scatter:
1549 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1550 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1551 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1552 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1553 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1554 case Intrinsic::aarch64_sve_st1dq:
1555 case Intrinsic::aarch64_sve_st1q_scatter_index:
1556 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1557 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1558 case Intrinsic::aarch64_sve_st1wq:
1559 case Intrinsic::aarch64_sve_stnt1:
1560 case Intrinsic::aarch64_sve_stnt1_scatter:
1561 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1562 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1563 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1565 case Intrinsic::aarch64_sve_st2:
1566 case Intrinsic::aarch64_sve_st2q:
1568 case Intrinsic::aarch64_sve_st3:
1569 case Intrinsic::aarch64_sve_st3q:
1571 case Intrinsic::aarch64_sve_st4:
1572 case Intrinsic::aarch64_sve_st4q:
1574 }
1575
1576 return SVEIntrinsicInfo();
1577}
1578
1579static bool isAllActivePredicate(Value *Pred) {
1580 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1581 Value *UncastedPred;
1582 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1583 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1584 m_Value(UncastedPred)))))
1585 // If the predicate has the same or less lanes than the uncasted
1586 // predicate then we know the casting has no effect.
1587 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1588 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1589 Pred = UncastedPred;
1590 auto *C = dyn_cast<Constant>(Pred);
1591 return (C && C->isAllOnesValue());
1592}
1593
1594// Simplify `V` by only considering the operations that affect active lanes.
1595// This function should only return existing Values or newly created Constants.
1596static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1597 auto *Dup = dyn_cast<IntrinsicInst>(V);
1598 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1599 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1601 cast<VectorType>(V->getType())->getElementCount(),
1602 cast<Constant>(Dup->getOperand(2)));
1603
1604 return V;
1605}
1606
1607static std::optional<Instruction *>
1609 const SVEIntrinsicInfo &IInfo) {
1610 const unsigned Opc = IInfo.getMatchingIROpode();
1611 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1612
1613 Value *Pg = II.getOperand(0);
1614 Value *Op1 = II.getOperand(1);
1615 Value *Op2 = II.getOperand(2);
1616 const DataLayout &DL = II.getDataLayout();
1617
1618 // Canonicalise constants to the RHS.
1620 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1621 IC.replaceOperand(II, 1, Op2);
1622 IC.replaceOperand(II, 2, Op1);
1623 return &II;
1624 }
1625
1626 // Only active lanes matter when simplifying the operation.
1627 Op1 = stripInactiveLanes(Op1, Pg);
1628 Op2 = stripInactiveLanes(Op2, Pg);
1629
1630 Value *SimpleII;
1631 if (auto FII = dyn_cast<FPMathOperator>(&II))
1632 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1633 else
1634 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1635
1636 // An SVE intrinsic's result is always defined. However, this is not the case
1637 // for its equivalent IR instruction (e.g. when shifting by an amount more
1638 // than the data's bitwidth). Simplifications to an undefined result must be
1639 // ignored to preserve the intrinsic's expected behaviour.
1640 if (!SimpleII || isa<UndefValue>(SimpleII))
1641 return std::nullopt;
1642
1643 if (IInfo.inactiveLanesAreNotDefined())
1644 return IC.replaceInstUsesWith(II, SimpleII);
1645
1646 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1647
1648 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1649 if (SimpleII == Inactive)
1650 return IC.replaceInstUsesWith(II, SimpleII);
1651
1652 // Inactive lanes must be preserved.
1653 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1654 return IC.replaceInstUsesWith(II, SimpleII);
1655}
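// For example (illustrative; operand names are placeholders): a merging
// multiply by a predicated splat of one,
//   sve.mul(%pg, %a, sve.dup(undef, %pg, 1))  ==>  %a
// whereas a multiply by a splat of zero folds to
// select(%pg, zeroinitializer, %a) so the inactive lanes of %a are preserved.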
1656
1657// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1658// to operations with less strict inactive lane requirements.
1659static std::optional<Instruction *>
1661 const SVEIntrinsicInfo &IInfo) {
1662 if (!IInfo.hasGoverningPredicate())
1663 return std::nullopt;
1664
1665 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1666
1667 // If there are no active lanes.
1668 if (match(OpPredicate, m_ZeroInt())) {
1670 return IC.replaceInstUsesWith(
1671 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1672
1673 if (IInfo.inactiveLanesAreUnused()) {
1674 if (IInfo.resultIsZeroInitialized())
1676
1677 return IC.eraseInstFromFunction(II);
1678 }
1679 }
1680
1681 // If there are no inactive lanes.
1682 if (isAllActivePredicate(OpPredicate)) {
1683 if (IInfo.hasOperandWithNoActiveLanes()) {
1684 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1685 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1686 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1687 }
1688
1689 if (IInfo.hasMatchingUndefIntrinsic()) {
1690 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1691 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1692 II.setCalledFunction(NewDecl);
1693 return &II;
1694 }
1695 }
1696
1697 // Operation specific simplifications.
1698 if (IInfo.hasMatchingIROpode() &&
1700 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1701
1702 return std::nullopt;
1703}
1704
1705// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1706// => (binop (pred) (from_svbool _) (from_svbool _))
1707//
1708// The above transformation eliminates a `to_svbool` in the predicate
1709// operand of bitwise operation `binop` by narrowing the vector width of
1710// the operation. For example, it would convert a `<vscale x 16 x i1>
1711// and` into a `<vscale x 4 x i1> and`. This is profitable because
1712// to_svbool must zero the new lanes during widening, whereas
1713// from_svbool is free.
1714static std::optional<Instruction *>
1716 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1717 if (!BinOp)
1718 return std::nullopt;
1719
1720 auto IntrinsicID = BinOp->getIntrinsicID();
1721 switch (IntrinsicID) {
1722 case Intrinsic::aarch64_sve_and_z:
1723 case Intrinsic::aarch64_sve_bic_z:
1724 case Intrinsic::aarch64_sve_eor_z:
1725 case Intrinsic::aarch64_sve_nand_z:
1726 case Intrinsic::aarch64_sve_nor_z:
1727 case Intrinsic::aarch64_sve_orn_z:
1728 case Intrinsic::aarch64_sve_orr_z:
1729 break;
1730 default:
1731 return std::nullopt;
1732 }
1733
1734 auto BinOpPred = BinOp->getOperand(0);
1735 auto BinOpOp1 = BinOp->getOperand(1);
1736 auto BinOpOp2 = BinOp->getOperand(2);
1737
1738 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1739 if (!PredIntr ||
1740 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1741 return std::nullopt;
1742
1743 auto PredOp = PredIntr->getOperand(0);
1744 auto PredOpTy = cast<VectorType>(PredOp->getType());
1745 if (PredOpTy != II.getType())
1746 return std::nullopt;
1747
1748 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1749 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1750 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1751 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1752 if (BinOpOp1 == BinOpOp2)
1753 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1754 else
1755 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1756 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1757
1758 auto NarrowedBinOp =
1759 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1760 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1761}
1762
1763static std::optional<Instruction *>
1765 // If the reinterpret instruction operand is a PHI Node
1766 if (isa<PHINode>(II.getArgOperand(0)))
1767 return processPhiNode(IC, II);
1768
1769 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1770 return BinOpCombine;
1771
1772 // Ignore converts to/from svcount_t.
1773 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1774 isa<TargetExtType>(II.getType()))
1775 return std::nullopt;
1776
1777 SmallVector<Instruction *, 32> CandidatesForRemoval;
1778 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1779
1780 const auto *IVTy = cast<VectorType>(II.getType());
1781
1782 // Walk the chain of conversions.
1783 while (Cursor) {
1784 // If the type of the cursor has fewer lanes than the final result, zeroing
1785 // must take place, which breaks the equivalence chain.
1786 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1787 if (CursorVTy->getElementCount().getKnownMinValue() <
1788 IVTy->getElementCount().getKnownMinValue())
1789 break;
1790
1791 // If the cursor has the same type as I, it is a viable replacement.
1792 if (Cursor->getType() == IVTy)
1793 EarliestReplacement = Cursor;
1794
1795 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1796
1797 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1798 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1799 Intrinsic::aarch64_sve_convert_to_svbool ||
1800 IntrinsicCursor->getIntrinsicID() ==
1801 Intrinsic::aarch64_sve_convert_from_svbool))
1802 break;
1803
1804 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1805 Cursor = IntrinsicCursor->getOperand(0);
1806 }
1807
1808 // If no viable replacement in the conversion chain was found, there is
1809 // nothing to do.
1810 if (!EarliestReplacement)
1811 return std::nullopt;
1812
1813 return IC.replaceInstUsesWith(II, EarliestReplacement);
1814}
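// For example (illustrative; operand names are placeholders):
//   %b = convert.to.svbool(<vscale x 4 x i1> %p)
//   %r = convert.from.svbool(%b)   ; same type as %p
// %r is replaced by %p, since no step in the chain has fewer lanes than the
// final result and therefore no lanes were zeroed along the way.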
1815
1816static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1817 IntrinsicInst &II) {
1818 // svsel(ptrue, x, y) => x
1819 auto *OpPredicate = II.getOperand(0);
1820 if (isAllActivePredicate(OpPredicate))
1821 return IC.replaceInstUsesWith(II, II.getOperand(1));
1822
1823 auto Select =
1824 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1825 return IC.replaceInstUsesWith(II, Select);
1826}
1827
1828static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1829 IntrinsicInst &II) {
1830 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1831 if (!Pg)
1832 return std::nullopt;
1833
1834 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1835 return std::nullopt;
1836
1837 const auto PTruePattern =
1838 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1839 if (PTruePattern != AArch64SVEPredPattern::vl1)
1840 return std::nullopt;
1841
1842 // The intrinsic is inserting into lane zero so use an insert instead.
1843 auto *IdxTy = Type::getInt64Ty(II.getContext());
1844 auto *Insert = InsertElementInst::Create(
1845 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1846 Insert->insertBefore(II.getIterator());
1847 Insert->takeName(&II);
1848
1849 return IC.replaceInstUsesWith(II, Insert);
1850}
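// For example (illustrative; operand names are placeholders):
//   sve.dup(%passthru, ptrue(vl1), %x)  ==>  insertelement %passthru, %x, i64 0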
1851
1852static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1853 IntrinsicInst &II) {
1854 // Replace DupX with a regular IR splat.
1855 auto *RetTy = cast<ScalableVectorType>(II.getType());
1856 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1857 II.getArgOperand(0));
1858 Splat->takeName(&II);
1859 return IC.replaceInstUsesWith(II, Splat);
1860}
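// For example (illustrative): sve.dup.x(%x) becomes a plain IR splat of %x
// over the scalable result type, which generic folds can reason about directly.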
1861
1862static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1863 IntrinsicInst &II) {
1864 LLVMContext &Ctx = II.getContext();
1865
1866 if (!isAllActivePredicate(II.getArgOperand(0)))
1867 return std::nullopt;
1868
1869 // Check that we have a compare of zero..
1870 auto *SplatValue =
1872 if (!SplatValue || !SplatValue->isZero())
1873 return std::nullopt;
1874
1875 // ..against a dupq
1876 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1877 if (!DupQLane ||
1878 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1879 return std::nullopt;
1880
1881 // Where the dupq is a lane 0 replicate of a vector insert
1882 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1883 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1884 return std::nullopt;
1885
1886 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1887 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1888 return std::nullopt;
1889
1890 // Where the vector insert is a fixed constant vector insert into undef at
1891 // index zero
1892 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1893 return std::nullopt;
1894
1895 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1896 return std::nullopt;
1897
1898 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1899 if (!ConstVec)
1900 return std::nullopt;
1901
1902 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1903 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1904 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1905 return std::nullopt;
1906
1907 unsigned NumElts = VecTy->getNumElements();
1908 unsigned PredicateBits = 0;
1909
1910 // Expand intrinsic operands to a 16-bit byte level predicate
1911 for (unsigned I = 0; I < NumElts; ++I) {
1912 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1913 if (!Arg)
1914 return std::nullopt;
1915 if (!Arg->isZero())
1916 PredicateBits |= 1 << (I * (16 / NumElts));
1917 }
1918
1919 // If all bits are zero bail early with an empty predicate
1920 if (PredicateBits == 0) {
1921 auto *PFalse = Constant::getNullValue(II.getType());
1922 PFalse->takeName(&II);
1923 return IC.replaceInstUsesWith(II, PFalse);
1924 }
1925
1926 // Calculate largest predicate type used (where byte predicate is largest)
1927 unsigned Mask = 8;
1928 for (unsigned I = 0; I < 16; ++I)
1929 if ((PredicateBits & (1 << I)) != 0)
1930 Mask |= (I % 8);
1931
1932 unsigned PredSize = Mask & -Mask;
1933 auto *PredType = ScalableVectorType::get(
1934 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1935
1936 // Ensure all relevant bits are set
1937 for (unsigned I = 0; I < 16; I += PredSize)
1938 if ((PredicateBits & (1 << I)) == 0)
1939 return std::nullopt;
1940
1941 auto *PTruePat =
1942 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1943 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1944 {PredType}, {PTruePat});
1945 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1946 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1947 auto *ConvertFromSVBool =
1948 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1949 {II.getType()}, {ConvertToSVBool});
1950
1951 ConvertFromSVBool->takeName(&II);
1952 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1953}
1954
1955static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1956 IntrinsicInst &II) {
1957 Value *Pg = II.getArgOperand(0);
1958 Value *Vec = II.getArgOperand(1);
1959 auto IntrinsicID = II.getIntrinsicID();
1960 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1961
1962 // lastX(splat(X)) --> X
1963 if (auto *SplatVal = getSplatValue(Vec))
1964 return IC.replaceInstUsesWith(II, SplatVal);
1965
1966 // If x and/or y is a splat value then:
1967 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1968 Value *LHS, *RHS;
1969 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1970 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1971 auto *OldBinOp = cast<BinaryOperator>(Vec);
1972 auto OpC = OldBinOp->getOpcode();
1973 auto *NewLHS =
1974 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1975 auto *NewRHS =
1976 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1978 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1979 return IC.replaceInstUsesWith(II, NewBinOp);
1980 }
1981 }
1982
1983 auto *C = dyn_cast<Constant>(Pg);
1984 if (IsAfter && C && C->isNullValue()) {
1985 // The intrinsic is extracting lane 0 so use an extract instead.
1986 auto *IdxTy = Type::getInt64Ty(II.getContext());
1987 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1988 Extract->insertBefore(II.getIterator());
1989 Extract->takeName(&II);
1990 return IC.replaceInstUsesWith(II, Extract);
1991 }
1992
1993 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1994 if (!IntrPG)
1995 return std::nullopt;
1996
1997 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1998 return std::nullopt;
1999
2000 const auto PTruePattern =
2001 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2002
2003 // Can the intrinsic's predicate be converted to a known constant index?
2004 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2005 if (!MinNumElts)
2006 return std::nullopt;
2007
2008 unsigned Idx = MinNumElts - 1;
2009 // Increment the index if extracting the element after the last active
2010 // predicate element.
2011 if (IsAfter)
2012 ++Idx;
2013
2014 // Ignore extracts whose index is larger than the known minimum vector
2015 // length. NOTE: This is an artificial constraint where we prefer to
2016 // maintain what the user asked for until an alternative is proven faster.
2017 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2018 if (Idx >= PgVTy->getMinNumElements())
2019 return std::nullopt;
2020
2021 // The intrinsic is extracting a fixed lane so use an extract instead.
2022 auto *IdxTy = Type::getInt64Ty(II.getContext());
2023 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2024 Extract->insertBefore(II.getIterator());
2025 Extract->takeName(&II);
2026 return IC.replaceInstUsesWith(II, Extract);
2027}
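// For example (illustrative; operand names are placeholders): with a
// constant-length predicate such as ptrue(vl4) over <vscale x 8 x i16>,
//   lastb(ptrue(vl4), %v)  ==>  extractelement %v, i64 3
//   lasta(ptrue(vl4), %v)  ==>  extractelement %v, i64 4
// provided the resulting index stays below the known minimum vector length.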
2028
2029static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2030 IntrinsicInst &II) {
2031 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2032 // integer variant across a variety of micro-architectures. Replace scalar
2033 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2034 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2035 // depending on the micro-architecture, but has been observed as generally
2036 // being faster, particularly when the CLAST[AB] op is a loop-carried
2037 // dependency.
2038 Value *Pg = II.getArgOperand(0);
2039 Value *Fallback = II.getArgOperand(1);
2040 Value *Vec = II.getArgOperand(2);
2041 Type *Ty = II.getType();
2042
2043 if (!Ty->isIntegerTy())
2044 return std::nullopt;
2045
2046 Type *FPTy;
2047 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2048 default:
2049 return std::nullopt;
2050 case 16:
2051 FPTy = IC.Builder.getHalfTy();
2052 break;
2053 case 32:
2054 FPTy = IC.Builder.getFloatTy();
2055 break;
2056 case 64:
2057 FPTy = IC.Builder.getDoubleTy();
2058 break;
2059 }
2060
2061 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2062 auto *FPVTy = VectorType::get(
2063 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2064 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2065 auto *FPII = IC.Builder.CreateIntrinsic(
2066 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2067 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2068 return IC.replaceInstUsesWith(II, FPIItoInt);
2069}
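// For example (illustrative): an i32 clastb.n is rewritten by bitcasting the
// fallback and vector operands to float, issuing clastb.n on
// <vscale x 4 x float>, and bitcasting the result back to i32.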
2070
2071static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2072 IntrinsicInst &II) {
2073 LLVMContext &Ctx = II.getContext();
2074 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2075 // can work with RDFFR_PP for ptest elimination.
2076 auto *AllPat =
2077 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2078 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2079 {II.getType()}, {AllPat});
2080 auto *RDFFR =
2081 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2082 RDFFR->takeName(&II);
2083 return IC.replaceInstUsesWith(II, RDFFR);
2084}
2085
2086static std::optional<Instruction *>
2088 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2089
2090 if (Pattern == AArch64SVEPredPattern::all) {
2092 II.getType(), ElementCount::getScalable(NumElts));
2093 Cnt->takeName(&II);
2094 return IC.replaceInstUsesWith(II, Cnt);
2095 }
2096
2097 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2098
2099 return MinNumElts && NumElts >= MinNumElts
2100 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2101 II, ConstantInt::get(II.getType(), MinNumElts)))
2102 : std::nullopt;
2103}
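// For example (illustrative): sve.cntw(all) becomes an element count of
// 4 x vscale, while sve.cntw(vl2) folds to the constant 2, since at least
// four 32-bit elements are always available per vector.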
2104
2105static std::optional<Instruction *>
2107 const AArch64Subtarget *ST) {
2108 if (!ST->isStreaming())
2109 return std::nullopt;
2110
2111 // In streaming mode, aarch64_sme_cntsd is equivalent to aarch64_sve_cntd
2112 // with SVEPredPattern::all.
2113 Value *Cnt =
2115 Cnt->takeName(&II);
2116 return IC.replaceInstUsesWith(II, Cnt);
2117}
2118
2119static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2120 IntrinsicInst &II) {
2121 Value *PgVal = II.getArgOperand(0);
2122 Value *OpVal = II.getArgOperand(1);
2123
2124 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2125 // Later optimizations prefer this form.
2126 if (PgVal == OpVal &&
2127 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2128 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2129 Value *Ops[] = {PgVal, OpVal};
2130 Type *Tys[] = {PgVal->getType()};
2131
2132 auto *PTest =
2133 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2134 PTest->takeName(&II);
2135
2136 return IC.replaceInstUsesWith(II, PTest);
2137 }
2138
2141
2142 if (!Pg || !Op)
2143 return std::nullopt;
2144
2145 Intrinsic::ID OpIID = Op->getIntrinsicID();
2146
2147 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2148 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2149 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2150 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2151 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2152
2153 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2154
2155 PTest->takeName(&II);
2156 return IC.replaceInstUsesWith(II, PTest);
2157 }
2158
2159 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2160 // Later optimizations may rewrite the sequence to use the flag-setting
2161 // variant of instruction X to remove the PTEST.
2162 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2163 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2164 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2165 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2166 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2167 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2168 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2169 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2170 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2171 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2172 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2173 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2174 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2175 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2176 Type *Tys[] = {Pg->getType()};
2177
2178 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2179 PTest->takeName(&II);
2180
2181 return IC.replaceInstUsesWith(II, PTest);
2182 }
2183
2184 return std::nullopt;
2185}
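// For example (illustrative; operand names are placeholders): when both
// ptest operands are widened from the same narrow predicate type,
//   ptest.any(to_svbool(%pg), to_svbool(%x))  ==>  ptest.any(%pg, %x)
// so the redundant conversions no longer block later ptest elimination.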
2186
2187template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
2188static std::optional<Instruction *>
2190 bool MergeIntoAddendOp) {
2191 Value *P = II.getOperand(0);
2192 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2193 if (MergeIntoAddendOp) {
2194 AddendOp = II.getOperand(1);
2195 Mul = II.getOperand(2);
2196 } else {
2197 AddendOp = II.getOperand(2);
2198 Mul = II.getOperand(1);
2199 }
2200
2202 m_Value(MulOp1))))
2203 return std::nullopt;
2204
2205 if (!Mul->hasOneUse())
2206 return std::nullopt;
2207
2208 Instruction *FMFSource = nullptr;
2209 if (II.getType()->isFPOrFPVectorTy()) {
2210 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2211 // Stop the combine when the flags on the inputs differ, in case dropping
2212 // flags would cause us to miss more beneficial optimizations.
2213 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2214 return std::nullopt;
2215 if (!FAddFlags.allowContract())
2216 return std::nullopt;
2217 FMFSource = &II;
2218 }
2219
2220 CallInst *Res;
2221 if (MergeIntoAddendOp)
2222 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2223 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2224 else
2225 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2226 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2227
2228 return IC.replaceInstUsesWith(II, Res);
2229}
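// For example (illustrative), instantiated with fmul/fmla:
//   fadd(%pg, %a, fmul(%pg, %b, %c))  ==>  fmla(%pg, %a, %b, %c)
// assuming both intrinsics use the same governing predicate, the fmul has no
// other users, and the fast-math flags match and allow contraction.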
2230
2231static std::optional<Instruction *>
2233 Value *Pred = II.getOperand(0);
2234 Value *PtrOp = II.getOperand(1);
2235 Type *VecTy = II.getType();
2236
2237 if (isAllActivePredicate(Pred)) {
2238 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2239 Load->copyMetadata(II);
2240 return IC.replaceInstUsesWith(II, Load);
2241 }
2242
2243 CallInst *MaskedLoad =
2244 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2245 Pred, ConstantAggregateZero::get(VecTy));
2246 MaskedLoad->copyMetadata(II);
2247 return IC.replaceInstUsesWith(II, MaskedLoad);
2248}
2249
2250static std::optional<Instruction *>
2252 Value *VecOp = II.getOperand(0);
2253 Value *Pred = II.getOperand(1);
2254 Value *PtrOp = II.getOperand(2);
2255
2256 if (isAllActivePredicate(Pred)) {
2257 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2258 Store->copyMetadata(II);
2259 return IC.eraseInstFromFunction(II);
2260 }
2261
2262 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2263 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2264 MaskedStore->copyMetadata(II);
2265 return IC.eraseInstFromFunction(II);
2266}
2267
2269 switch (Intrinsic) {
2270 case Intrinsic::aarch64_sve_fmul_u:
2271 return Instruction::BinaryOps::FMul;
2272 case Intrinsic::aarch64_sve_fadd_u:
2273 return Instruction::BinaryOps::FAdd;
2274 case Intrinsic::aarch64_sve_fsub_u:
2275 return Instruction::BinaryOps::FSub;
2276 default:
2277 return Instruction::BinaryOpsEnd;
2278 }
2279}
2280
2281static std::optional<Instruction *>
2283 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2284 if (II.isStrictFP())
2285 return std::nullopt;
2286
2287 auto *OpPredicate = II.getOperand(0);
2288 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2289 if (BinOpCode == Instruction::BinaryOpsEnd ||
2290 !isAllActivePredicate(OpPredicate))
2291 return std::nullopt;
2292 auto BinOp = IC.Builder.CreateBinOpFMF(
2293 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2294 return IC.replaceInstUsesWith(II, BinOp);
2295}
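// For example (illustrative): with an all-active predicate,
//   fmul_u(<all true>, %a, %b)  ==>  fmul %a, %b
// carrying over the intrinsic's fast-math flags.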
2296
2297static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2298 IntrinsicInst &II) {
2299 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2300 Intrinsic::aarch64_sve_mla>(
2301 IC, II, true))
2302 return MLA;
2303 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2304 Intrinsic::aarch64_sve_mad>(
2305 IC, II, false))
2306 return MAD;
2307 return std::nullopt;
2308}
2309
2310static std::optional<Instruction *>
2312 if (auto FMLA =
2313 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2314 Intrinsic::aarch64_sve_fmla>(IC, II,
2315 true))
2316 return FMLA;
2317 if (auto FMAD =
2318 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2319 Intrinsic::aarch64_sve_fmad>(IC, II,
2320 false))
2321 return FMAD;
2322 if (auto FMLA =
2323 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2324 Intrinsic::aarch64_sve_fmla>(IC, II,
2325 true))
2326 return FMLA;
2327 return std::nullopt;
2328}
2329
2330static std::optional<Instruction *>
2332 if (auto FMLA =
2333 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2334 Intrinsic::aarch64_sve_fmla>(IC, II,
2335 true))
2336 return FMLA;
2337 if (auto FMAD =
2338 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2339 Intrinsic::aarch64_sve_fmad>(IC, II,
2340 false))
2341 return FMAD;
2342 if (auto FMLA_U =
2343 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2344 Intrinsic::aarch64_sve_fmla_u>(
2345 IC, II, true))
2346 return FMLA_U;
2347 return instCombineSVEVectorBinOp(IC, II);
2348}
2349
2350static std::optional<Instruction *>
2352 if (auto FMLS =
2353 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2354 Intrinsic::aarch64_sve_fmls>(IC, II,
2355 true))
2356 return FMLS;
2357 if (auto FMSB =
2358 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2359 Intrinsic::aarch64_sve_fnmsb>(
2360 IC, II, false))
2361 return FMSB;
2362 if (auto FMLS =
2363 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2364 Intrinsic::aarch64_sve_fmls>(IC, II,
2365 true))
2366 return FMLS;
2367 return std::nullopt;
2368}
2369
2370static std::optional<Instruction *>
2372 if (auto FMLS =
2373 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2374 Intrinsic::aarch64_sve_fmls>(IC, II,
2375 true))
2376 return FMLS;
2377 if (auto FMSB =
2378 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2379 Intrinsic::aarch64_sve_fnmsb>(
2380 IC, II, false))
2381 return FMSB;
2382 if (auto FMLS_U =
2383 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2384 Intrinsic::aarch64_sve_fmls_u>(
2385 IC, II, true))
2386 return FMLS_U;
2387 return instCombineSVEVectorBinOp(IC, II);
2388}
2389
2390static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2391 IntrinsicInst &II) {
2392 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2393 Intrinsic::aarch64_sve_mls>(
2394 IC, II, true))
2395 return MLS;
2396 return std::nullopt;
2397}
2398
2399static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2400 IntrinsicInst &II) {
2401 Value *UnpackArg = II.getArgOperand(0);
2402 auto *RetTy = cast<ScalableVectorType>(II.getType());
2403 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2404 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2405
2406 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2407 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2408 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2409 ScalarArg =
2410 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2411 Value *NewVal =
2412 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2413 NewVal->takeName(&II);
2414 return IC.replaceInstUsesWith(II, NewVal);
2415 }
2416
2417 return std::nullopt;
2418}
2419static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2420 IntrinsicInst &II) {
2421 auto *OpVal = II.getOperand(0);
2422 auto *OpIndices = II.getOperand(1);
2423 VectorType *VTy = cast<VectorType>(II.getType());
2424
2425 // Check whether OpIndices is a constant splat value less than the minimum
2426 // element count of the result.
2427 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2428 if (!SplatValue ||
2429 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2430 return std::nullopt;
2431
2432 // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
2433 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2434 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2435 auto *VectorSplat =
2436 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2437
2438 VectorSplat->takeName(&II);
2439 return IC.replaceInstUsesWith(II, VectorSplat);
2440}
2441
2442static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2443 IntrinsicInst &II) {
2444 Value *A, *B;
2445 Type *RetTy = II.getType();
2446 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2447 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2448
2449 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2450 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2451 if ((match(II.getArgOperand(0),
2453 match(II.getArgOperand(1),
2455 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2456 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2457 auto *TyA = cast<ScalableVectorType>(A->getType());
2458 if (TyA == B->getType() &&
2460 auto *SubVec = IC.Builder.CreateInsertVector(
2461 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2462 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2463 TyA->getMinNumElements());
2464 ConcatVec->takeName(&II);
2465 return IC.replaceInstUsesWith(II, ConcatVec);
2466 }
2467 }
2468
2469 return std::nullopt;
2470}
2471
2472static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2473 IntrinsicInst &II) {
2474 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2475 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2476 Value *A, *B;
2477 if (match(II.getArgOperand(0),
2480 m_Specific(A), m_Specific(B))))
2481 return IC.replaceInstUsesWith(
2482 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2483
2484 return std::nullopt;
2485}
2486
2487static std::optional<Instruction *>
2489 Value *Mask = II.getOperand(0);
2490 Value *BasePtr = II.getOperand(1);
2491 Value *Index = II.getOperand(2);
2492 Type *Ty = II.getType();
2493 Value *PassThru = ConstantAggregateZero::get(Ty);
2494
2495 // Contiguous gather => masked load.
2496 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2497 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2498 Value *IndexBase;
2500 m_Value(IndexBase), m_SpecificInt(1)))) {
2501 Align Alignment =
2502 BasePtr->getPointerAlignment(II.getDataLayout());
2503
2504 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2505 BasePtr, IndexBase);
2506 CallInst *MaskedLoad =
2507 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2508 MaskedLoad->takeName(&II);
2509 return IC.replaceInstUsesWith(II, MaskedLoad);
2510 }
2511
2512 return std::nullopt;
2513}
2514
2515static std::optional<Instruction *>
2517 Value *Val = II.getOperand(0);
2518 Value *Mask = II.getOperand(1);
2519 Value *BasePtr = II.getOperand(2);
2520 Value *Index = II.getOperand(3);
2521 Type *Ty = Val->getType();
2522
2523 // Contiguous scatter => masked store.
2524 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2525 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2526 Value *IndexBase;
2528 m_Value(IndexBase), m_SpecificInt(1)))) {
2529 Align Alignment =
2530 BasePtr->getPointerAlignment(II.getDataLayout());
2531
2532 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2533 BasePtr, IndexBase);
2534 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2535
2536 return IC.eraseInstFromFunction(II);
2537 }
2538
2539 return std::nullopt;
2540}
2541
2542static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2543 IntrinsicInst &II) {
2545 Value *Pred = II.getOperand(0);
2546 Value *Vec = II.getOperand(1);
2547 Value *DivVec = II.getOperand(2);
2548
2549 Value *SplatValue = getSplatValue(DivVec);
2550 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2551 if (!SplatConstantInt)
2552 return std::nullopt;
2553
2554 APInt Divisor = SplatConstantInt->getValue();
2555 const int64_t DivisorValue = Divisor.getSExtValue();
2556 if (DivisorValue == -1)
2557 return std::nullopt;
2558 if (DivisorValue == 1)
2559 IC.replaceInstUsesWith(II, Vec);
2560
2561 if (Divisor.isPowerOf2()) {
2562 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2563 auto ASRD = IC.Builder.CreateIntrinsic(
2564 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2565 return IC.replaceInstUsesWith(II, ASRD);
2566 }
2567 if (Divisor.isNegatedPowerOf2()) {
2568 Divisor.negate();
2569 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2570 auto ASRD = IC.Builder.CreateIntrinsic(
2571 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2572 auto NEG = IC.Builder.CreateIntrinsic(
2573 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2574 return IC.replaceInstUsesWith(II, NEG);
2575 }
2576
2577 return std::nullopt;
2578}
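// For example (illustrative; operand names are placeholders):
//   sdiv(%pg, %x, splat(8))   ==>  asrd(%pg, %x, 3)
//   sdiv(%pg, %x, splat(-8))  ==>  a predicated negate of asrd(%pg, %x, 3)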
2579
2580bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2581 size_t VecSize = Vec.size();
2582 if (VecSize == 1)
2583 return true;
2584 if (!isPowerOf2_64(VecSize))
2585 return false;
2586 size_t HalfVecSize = VecSize / 2;
2587
2588 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2589 RHS != Vec.end(); LHS++, RHS++) {
2590 if (*LHS != nullptr && *RHS != nullptr) {
2591 if (*LHS == *RHS)
2592 continue;
2593 else
2594 return false;
2595 }
2596 if (!AllowPoison)
2597 return false;
2598 if (*LHS == nullptr && *RHS != nullptr)
2599 *LHS = *RHS;
2600 }
2601
2602 Vec.resize(HalfVecSize);
2603 SimplifyValuePattern(Vec, AllowPoison);
2604 return true;
2605}
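// For example (illustrative): {a, b, a, b} simplifies to {a, b}, and with
// AllowPoison {a, <unset>, a, b} also simplifies to {a, b}, because unset
// elements are taken from the matching half.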
2606
2607// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2608// to dupqlane(f64(C)) where C is A concatenated with B
2609static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2610 IntrinsicInst &II) {
2611 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2612 if (!match(II.getOperand(0),
2614 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2615 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2616 return std::nullopt;
2617 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2618
2619 // Insert the scalars into a container ordered by InsertElement index
2620 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2621 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2622 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2623 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2624 CurrentInsertElt = InsertElt->getOperand(0);
2625 }
2626
2627 bool AllowPoison =
2628 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2629 if (!SimplifyValuePattern(Elts, AllowPoison))
2630 return std::nullopt;
2631
2632 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2633 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2634 for (size_t I = 0; I < Elts.size(); I++) {
2635 if (Elts[I] == nullptr)
2636 continue;
2637 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2638 IC.Builder.getInt64(I));
2639 }
2640 if (InsertEltChain == nullptr)
2641 return std::nullopt;
2642
2643 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2644 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2645 // to be bitcast to a type wide enough to fit the sequence, splatted, and then
2646 // narrowed back to the original type.
2647 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2648 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2649 IIScalableTy->getMinNumElements() /
2650 PatternWidth;
2651
2652 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2653 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2654 auto *WideShuffleMaskTy =
2655 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2656
2657 auto InsertSubvector = IC.Builder.CreateInsertVector(
2658 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2659 uint64_t(0));
2660 auto WideBitcast =
2661 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2662 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2663 auto WideShuffle = IC.Builder.CreateShuffleVector(
2664 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2665 auto NarrowBitcast =
2666 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2667
2668 return IC.replaceInstUsesWith(II, NarrowBitcast);
2669}
2670
2671static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2672 IntrinsicInst &II) {
2673 Value *A = II.getArgOperand(0);
2674 Value *B = II.getArgOperand(1);
2675 if (A == B)
2676 return IC.replaceInstUsesWith(II, A);
2677
2678 return std::nullopt;
2679}
2680
2681static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2682 IntrinsicInst &II) {
2683 Value *Pred = II.getOperand(0);
2684 Value *Vec = II.getOperand(1);
2685 Value *Shift = II.getOperand(2);
2686
2687 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2688 Value *AbsPred, *MergedValue;
2690 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2692 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2693
2694 return std::nullopt;
2695
2696 // Transform is valid if any of the following are true:
2697 // * The ABS merge value is an undef or non-negative
2698 // * The ABS predicate is all active
2699 // * The ABS predicate and the SRSHL predicates are the same
2700 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2701 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2702 return std::nullopt;
2703
2704 // Only valid when the shift amount is non-negative, otherwise the rounding
2705 // behaviour of SRSHL cannot be ignored.
2706 if (!match(Shift, m_NonNegative()))
2707 return std::nullopt;
2708
2709 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2710 {II.getType()}, {Pred, Vec, Shift});
2711
2712 return IC.replaceInstUsesWith(II, LSL);
2713}
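// For example (illustrative; operand names are placeholders):
//   srshl(%pg, abs(%merge, %pg, %x), splat(2))
//   ==>  lsl(%pg, abs(%merge, %pg, %x), splat(2))
// since a known non-negative value shifted by a non-negative amount never
// needs the rounding behaviour of SRSHL.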
2714
2715static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2716 IntrinsicInst &II) {
2717 Value *Vec = II.getOperand(0);
2718
2719 if (getSplatValue(Vec) == II.getOperand(1))
2720 return IC.replaceInstUsesWith(II, Vec);
2721
2722 return std::nullopt;
2723}
2724
2725static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2726 IntrinsicInst &II) {
2727 // If this barrier is post-dominated by an identical one, we can remove it.
2728 auto *NI = II.getNextNode();
2729 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2730 auto CanSkipOver = [](Instruction *I) {
2731 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2732 };
2733 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2734 auto *NIBB = NI->getParent();
2735 NI = NI->getNextNode();
2736 if (!NI) {
2737 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2738 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2739 else
2740 break;
2741 }
2742 }
2743 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2744 if (NextII && II.isIdenticalTo(NextII))
2745 return IC.eraseInstFromFunction(II);
2746
2747 return std::nullopt;
2748}
2749
2750static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2751 IntrinsicInst &II) {
2752 return IC.replaceInstUsesWith(
2753 II,
2754 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2755 {II.getType(), II.getOperand(0)->getType()},
2756 {II.getOperand(0), II.getOperand(1)}));
2757}
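// For example (illustrative): whilelo(i64 %base, i64 %limit) becomes
// llvm.get.active.lane.mask(%base, %limit) with the same predicate result type.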
2758
2759static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2760 IntrinsicInst &II) {
2762 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2763 return std::nullopt;
2764}
2765
2766static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2768 unsigned NumBits) {
2769 Value *Passthru = II.getOperand(0);
2770 Value *Pg = II.getOperand(1);
2771 Value *Op = II.getOperand(2);
2772
2773 // Convert UXT[BHW] to AND.
2774 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2775 auto *Ty = cast<VectorType>(II.getType());
2776 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2777 auto *Mask = ConstantInt::get(Ty, MaskValue);
2778 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2779 {Pg, Op, Mask});
2780 return IC.replaceInstUsesWith(II, And);
2781 }
2782
2783 return std::nullopt;
2784}
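// For example (illustrative): on <vscale x 8 x i16>,
//   uxtb(undef, %pg, %x)  ==>  and_u(%pg, %x, splat(0xff))
// i.e. a predicated mask of the low 8 bits.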
2785
2786static std::optional<Instruction *>
2788 SMEAttrs FnSMEAttrs(*II.getFunction());
2789 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2790 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2791 return IC.replaceInstUsesWith(
2792 II, ConstantInt::getBool(II.getType(), IsStreaming));
2793 return std::nullopt;
2794}
2795
2796std::optional<Instruction *>
2798 IntrinsicInst &II) const {
2800 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2801 return I;
2802
2803 Intrinsic::ID IID = II.getIntrinsicID();
2804 switch (IID) {
2805 default:
2806 break;
2807 case Intrinsic::aarch64_dmb:
2808 return instCombineDMB(IC, II);
2809 case Intrinsic::aarch64_neon_fmaxnm:
2810 case Intrinsic::aarch64_neon_fminnm:
2811 return instCombineMaxMinNM(IC, II);
2812 case Intrinsic::aarch64_sve_convert_from_svbool:
2813 return instCombineConvertFromSVBool(IC, II);
2814 case Intrinsic::aarch64_sve_dup:
2815 return instCombineSVEDup(IC, II);
2816 case Intrinsic::aarch64_sve_dup_x:
2817 return instCombineSVEDupX(IC, II);
2818 case Intrinsic::aarch64_sve_cmpne:
2819 case Intrinsic::aarch64_sve_cmpne_wide:
2820 return instCombineSVECmpNE(IC, II);
2821 case Intrinsic::aarch64_sve_rdffr:
2822 return instCombineRDFFR(IC, II);
2823 case Intrinsic::aarch64_sve_lasta:
2824 case Intrinsic::aarch64_sve_lastb:
2825 return instCombineSVELast(IC, II);
2826 case Intrinsic::aarch64_sve_clasta_n:
2827 case Intrinsic::aarch64_sve_clastb_n:
2828 return instCombineSVECondLast(IC, II);
2829 case Intrinsic::aarch64_sve_cntd:
2830 return instCombineSVECntElts(IC, II, 2);
2831 case Intrinsic::aarch64_sve_cntw:
2832 return instCombineSVECntElts(IC, II, 4);
2833 case Intrinsic::aarch64_sve_cnth:
2834 return instCombineSVECntElts(IC, II, 8);
2835 case Intrinsic::aarch64_sve_cntb:
2836 return instCombineSVECntElts(IC, II, 16);
2837 case Intrinsic::aarch64_sme_cntsd:
2838 return instCombineSMECntsd(IC, II, ST);
2839 case Intrinsic::aarch64_sve_ptest_any:
2840 case Intrinsic::aarch64_sve_ptest_first:
2841 case Intrinsic::aarch64_sve_ptest_last:
2842 return instCombineSVEPTest(IC, II);
2843 case Intrinsic::aarch64_sve_fadd:
2844 return instCombineSVEVectorFAdd(IC, II);
2845 case Intrinsic::aarch64_sve_fadd_u:
2846 return instCombineSVEVectorFAddU(IC, II);
2847 case Intrinsic::aarch64_sve_fmul_u:
2848 return instCombineSVEVectorBinOp(IC, II);
2849 case Intrinsic::aarch64_sve_fsub:
2850 return instCombineSVEVectorFSub(IC, II);
2851 case Intrinsic::aarch64_sve_fsub_u:
2852 return instCombineSVEVectorFSubU(IC, II);
2853 case Intrinsic::aarch64_sve_add:
2854 return instCombineSVEVectorAdd(IC, II);
2855 case Intrinsic::aarch64_sve_add_u:
2856 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2857 Intrinsic::aarch64_sve_mla_u>(
2858 IC, II, true);
2859 case Intrinsic::aarch64_sve_sub:
2860 return instCombineSVEVectorSub(IC, II);
2861 case Intrinsic::aarch64_sve_sub_u:
2862 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2863 Intrinsic::aarch64_sve_mls_u>(
2864 IC, II, true);
2865 case Intrinsic::aarch64_sve_tbl:
2866 return instCombineSVETBL(IC, II);
2867 case Intrinsic::aarch64_sve_uunpkhi:
2868 case Intrinsic::aarch64_sve_uunpklo:
2869 case Intrinsic::aarch64_sve_sunpkhi:
2870 case Intrinsic::aarch64_sve_sunpklo:
2871 return instCombineSVEUnpack(IC, II);
2872 case Intrinsic::aarch64_sve_uzp1:
2873 return instCombineSVEUzp1(IC, II);
2874 case Intrinsic::aarch64_sve_zip1:
2875 case Intrinsic::aarch64_sve_zip2:
2876 return instCombineSVEZip(IC, II);
2877 case Intrinsic::aarch64_sve_ld1_gather_index:
2878 return instCombineLD1GatherIndex(IC, II);
2879 case Intrinsic::aarch64_sve_st1_scatter_index:
2880 return instCombineST1ScatterIndex(IC, II);
2881 case Intrinsic::aarch64_sve_ld1:
2882 return instCombineSVELD1(IC, II, DL);
2883 case Intrinsic::aarch64_sve_st1:
2884 return instCombineSVEST1(IC, II, DL);
2885 case Intrinsic::aarch64_sve_sdiv:
2886 return instCombineSVESDIV(IC, II);
2887 case Intrinsic::aarch64_sve_sel:
2888 return instCombineSVESel(IC, II);
2889 case Intrinsic::aarch64_sve_srshl:
2890 return instCombineSVESrshl(IC, II);
2891 case Intrinsic::aarch64_sve_dupq_lane:
2892 return instCombineSVEDupqLane(IC, II);
2893 case Intrinsic::aarch64_sve_insr:
2894 return instCombineSVEInsr(IC, II);
2895 case Intrinsic::aarch64_sve_whilelo:
2896 return instCombineWhilelo(IC, II);
2897 case Intrinsic::aarch64_sve_ptrue:
2898 return instCombinePTrue(IC, II);
2899 case Intrinsic::aarch64_sve_uxtb:
2900 return instCombineSVEUxt(IC, II, 8);
2901 case Intrinsic::aarch64_sve_uxth:
2902 return instCombineSVEUxt(IC, II, 16);
2903 case Intrinsic::aarch64_sve_uxtw:
2904 return instCombineSVEUxt(IC, II, 32);
2905 case Intrinsic::aarch64_sme_in_streaming_mode:
2906 return instCombineInStreamingMode(IC, II);
2907 }
2908
2909 return std::nullopt;
2910}
2911
2913 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2914 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2915 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2916 SimplifyAndSetOp) const {
2917 switch (II.getIntrinsicID()) {
2918 default:
2919 break;
2920 case Intrinsic::aarch64_neon_fcvtxn:
2921 case Intrinsic::aarch64_neon_rshrn:
2922 case Intrinsic::aarch64_neon_sqrshrn:
2923 case Intrinsic::aarch64_neon_sqrshrun:
2924 case Intrinsic::aarch64_neon_sqshrn:
2925 case Intrinsic::aarch64_neon_sqshrun:
2926 case Intrinsic::aarch64_neon_sqxtn:
2927 case Intrinsic::aarch64_neon_sqxtun:
2928 case Intrinsic::aarch64_neon_uqrshrn:
2929 case Intrinsic::aarch64_neon_uqshrn:
2930 case Intrinsic::aarch64_neon_uqxtn:
2931 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2932 break;
2933 }
2934
2935 return std::nullopt;
2936}
2937
2939 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2941}
2942
2945 switch (K) {
2947 return TypeSize::getFixed(64);
2949 if (ST->useSVEForFixedLengthVectors() &&
2950 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2951 return TypeSize::getFixed(
2952 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2953 else if (ST->isNeonAvailable())
2954 return TypeSize::getFixed(128);
2955 else
2956 return TypeSize::getFixed(0);
2958 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2960 return TypeSize::getScalable(128);
2961 else
2962 return TypeSize::getScalable(0);
2963 }
2964 llvm_unreachable("Unsupported register kind");
2965}
2966
2967bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2969 Type *SrcOverrideTy) const {
2970 // A helper that returns a vector type from the given type. The number of
2971 // elements in DstTy determines the vector width.
2972 auto toVectorTy = [&](Type *ArgTy) {
2973 return VectorType::get(ArgTy->getScalarType(),
2974 cast<VectorType>(DstTy)->getElementCount());
2975 };
2976
2977 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2978 // i32, i64]. SVE doesn't generally have the same set of instructions to
2979 // perform an extend with the add/sub/mul. There are SMULLB style
2980 // instructions, but they operate on top/bottom, requiring some sort of lane
2981 // interleaving to be used with zext/sext.
2982 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2983 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2984 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2985 return false;
2986
2987 // Determine if the operation has a widening variant. We consider both the
2988 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2989 // instructions.
2990 //
2991 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2992 // verify that their extending operands are eliminated during code
2993 // generation.
2994 Type *SrcTy = SrcOverrideTy;
2995 switch (Opcode) {
2996 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2997 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2998 // The second operand needs to be an extend
2999 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3000 if (!SrcTy)
3001 SrcTy =
3002 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3003 } else
3004 return false;
3005 break;
3006 case Instruction::Mul: { // SMULL(2), UMULL(2)
3007 // Both operands need to be extends of the same type.
3008 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3009 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3010 if (!SrcTy)
3011 SrcTy =
3012 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3013 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
3014 // If one of the operands is a Zext and the other has enough zero bits to
3015 // be treated as unsigned, we can still generate a umull, meaning the zext
3016 // is free.
3017 KnownBits Known =
3018 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3019 if (Args[0]->getType()->getScalarSizeInBits() -
3020 Known.Zero.countLeadingOnes() >
3021 DstTy->getScalarSizeInBits() / 2)
3022 return false;
3023 if (!SrcTy)
3024 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
3025 DstTy->getScalarSizeInBits() / 2));
3026 } else
3027 return false;
3028 break;
3029 }
3030 default:
3031 return false;
3032 }
3033
3034 // Legalize the destination type and ensure it can be used in a widening
3035 // operation.
3036 auto DstTyL = getTypeLegalizationCost(DstTy);
3037 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3038 return false;
3039
3040 // Legalize the source type and ensure it can be used in a widening
3041 // operation.
3042 assert(SrcTy && "Expected some SrcTy");
3043 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3044 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3045 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3046 return false;
3047
3048 // Get the total number of vector elements in the legalized types.
3049 InstructionCost NumDstEls =
3050 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3051 InstructionCost NumSrcEls =
3052 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3053
3054 // Return true if the legalized types have the same number of vector elements
3055 // and the destination element type size is twice that of the source type.
3056 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3057}
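// For example (illustrative types): an add whose second operand is an extend,
//   add <8 x i16> %a, (zext <8 x i8> %b to <8 x i16>)
// can use the uaddw form, so the zext is treated as free by the cast cost
// code below.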
3058
3059// s/urhadd instructions implement the following pattern, making the
3060// extends free:
3061// %x = add ((zext i8 -> i16), 1)
3062// %y = (zext i8 -> i16)
3063// trunc i16 (lshr (add %x, %y), 1) -> i8
3064//
3066 Type *Src) const {
3067 // The source should be a legal vector type.
3068 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3069 (Src->isScalableTy() && !ST->hasSVE2()))
3070 return false;
3071
3072 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3073 return false;
3074
3075 // Look for trunc/shl/add before trying to match the pattern.
3076 const Instruction *Add = ExtUser;
3077 auto *AddUser =
3078 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3079 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3080 Add = AddUser;
3081
3082 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3083 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3084 return false;
3085
3086 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3087 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3088 Src->getScalarSizeInBits() !=
3089 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3090 return false;
3091
3092 // Try to match the whole pattern. Ext could be either the first or second
3093 // m_ZExtOrSExt matched.
3094 Instruction *Ex1, *Ex2;
3095 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3096 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3097 return false;
3098
3099 // Ensure both extends are of the same type
3100 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3101 Ex1->getOpcode() == Ex2->getOpcode())
3102 return true;
3103
3104 return false;
3105}
3106
3108 Type *Src,
3111 const Instruction *I) const {
3112 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3113 assert(ISD && "Invalid opcode");
3114 // If the cast is observable, and it is used by a widening instruction (e.g.,
3115 // uaddl, saddw, etc.), it may be free.
3116 if (I && I->hasOneUser()) {
3117 auto *SingleUser = cast<Instruction>(*I->user_begin());
3118 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3119 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
3120 // For adds, only count the second operand as free if both operands are
3121 // extends but not the same operation (i.e. both operands are not free in
3122 // add(sext, zext)).
3123 if (SingleUser->getOpcode() == Instruction::Add) {
3124 if (I == SingleUser->getOperand(1) ||
3125 (isa<CastInst>(SingleUser->getOperand(1)) &&
3126 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3127 return 0;
3128 } else // Others are free so long as isWideningInstruction returned true.
3129 return 0;
3130 }
3131
3132 // The cast will be free for the s/urhadd instructions
3133 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3134 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3135 return 0;
3136 }
3137
3138 // TODO: Allow non-throughput costs that aren't binary.
3139 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3141 return Cost == 0 ? 0 : 1;
3142 return Cost;
3143 };
3144
3145 EVT SrcTy = TLI->getValueType(DL, Src);
3146 EVT DstTy = TLI->getValueType(DL, Dst);
3147
3148 if (!SrcTy.isSimple() || !DstTy.isSimple())
3149 return AdjustCost(
3150 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3151
3152 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3153 // we use fcvtx under SVE2. Give them invalid costs.
3154 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3155 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3156 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3158
3159 static const TypeConversionCostTblEntry BF16Tbl[] = {
3160 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3161 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3162 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3163 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3164 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3165 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3166 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3167 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3168 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3169 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3170 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3171 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3172 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3173 };
3174
3175 if (ST->hasBF16())
3176 if (const auto *Entry = ConvertCostTableLookup(
3177 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3178 return AdjustCost(Entry->Cost);
3179
3180 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3181 // The cost of unpacking twice is artificially increased for now in order
3182 // to avoid regressions against NEON, which will use tbl instructions directly
3183 // instead of multiple layers of [s|u]unpk[lo|hi].
3184 // We use the unpacks in cases where the destination type is illegal and
3185 // requires splitting of the input, even if the input type itself is legal.
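// For example, in the table below nxv8f32 <- nxv8i16 is costed as one unpack
// step plus two converts (SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST), while
// nxv16f32 <- nxv16i8 needs two unpack layers and four converts.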
3186 const unsigned int SVE_EXT_COST = 1;
3187 const unsigned int SVE_FCVT_COST = 1;
3188 const unsigned int SVE_UNPACK_ONCE = 4;
3189 const unsigned int SVE_UNPACK_TWICE = 16;
3190
3191 static const TypeConversionCostTblEntry ConversionTbl[] = {
3192 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3193 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3194 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3195 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3196 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3197 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3198 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3199 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3200 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3201 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3202 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3203 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3204 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3205 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3206 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3207 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3208 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3209 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3210 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3211 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3212
3213 // Truncations on nxvmiN
3214 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3215 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3216 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3217 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3218 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3219 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3220 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3221 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3222 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3223 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3224 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3225 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3226 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3227 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3228 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3229 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3230 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3231 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3232 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3233 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3234 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3235 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3236 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3237 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3238 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3239 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3240 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3241 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3242 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3243 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3244 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3245 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3246 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3247
3248 // The number of shll instructions for the extension.
3249 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3250 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3251 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3252 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3253 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3254 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3255 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3256 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3257 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3258 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3259 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3260 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3261 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3262 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3263 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3264 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3265
3266 // FP Ext and trunc
3267 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3268 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3269 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3270 // FP16
3271 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3272 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3273 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3274 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3275 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3276 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3277 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3278 // BF16 (uses shift)
3279 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3280 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3281 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3282 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3283 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3284 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3285 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3286 // FP Ext and trunc
3287 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3288 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3289 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3290 // FP16
3291 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3292 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3293 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3294 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3295 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3296 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3297 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3298 // BF16 (more complex, with +bf16 is handled above)
3299 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3300 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3301 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3302 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3303 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3304 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3305 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3306 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3307
3308 // LowerVectorINT_TO_FP:
3309 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3310 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3311 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3312 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3313 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3314 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3315
3316 // SVE: to nxv2f16
3317 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3318 SVE_EXT_COST + SVE_FCVT_COST},
3319 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3320 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3321 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3322 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3323 SVE_EXT_COST + SVE_FCVT_COST},
3324 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3325 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3326 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3327
3328 // SVE: to nxv4f16
3329 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3330 SVE_EXT_COST + SVE_FCVT_COST},
3331 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3332 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3333 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3334 SVE_EXT_COST + SVE_FCVT_COST},
3335 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3336 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3337
3338 // SVE: to nxv8f16
3339 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3340 SVE_EXT_COST + SVE_FCVT_COST},
3341 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3342 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3343 SVE_EXT_COST + SVE_FCVT_COST},
3344 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3345
3346 // SVE: to nxv16f16
3347 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3348 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3349 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3350 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3351
3352 // Complex: to v2f32
3353 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3354 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3355 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3356 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3357
3358 // SVE: to nxv2f32
3359 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3360 SVE_EXT_COST + SVE_FCVT_COST},
3361 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3362 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3363 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3364 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3365 SVE_EXT_COST + SVE_FCVT_COST},
3366 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3367 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3368 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3369
3370 // Complex: to v4f32
3371 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3372 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3373 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3374 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3375
3376 // SVE: to nxv4f32
3377 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3378 SVE_EXT_COST + SVE_FCVT_COST},
3379 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3380 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3381 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3382 SVE_EXT_COST + SVE_FCVT_COST},
3383 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3384 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3385
3386 // Complex: to v8f32
3387 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3388 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3389 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3390 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3391
3392 // SVE: to nxv8f32
3393 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3394 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3395 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3396 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3397 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3398 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3399 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3400 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3401
3402 // SVE: to nxv16f32
3403 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3404 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3405 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3406 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3407
3408 // Complex: to v16f32
3409 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3410 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3411
3412 // Complex: to v2f64
3413 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3414 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3415 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3416 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3417 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3418 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3419
3420 // SVE: to nxv2f64
3421 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3422 SVE_EXT_COST + SVE_FCVT_COST},
3423 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3424 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3425 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3426 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3427 SVE_EXT_COST + SVE_FCVT_COST},
3428 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3429 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3430 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3431
3432 // Complex: to v4f64
3433 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3434 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3435
3436 // SVE: to nxv4f64
3437 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3438 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3439 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3440 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3441 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3442 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3443 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3444 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3445 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3446 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3447 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3448 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3449
3450 // SVE: to nxv8f64
3451 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3452 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3453 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3454 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3455 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3456 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3457 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3458 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3459
3460 // LowerVectorFP_TO_INT
3461 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3462 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3463 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3464 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3465 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3466 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3467
3468 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3469 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3470 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3471 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3472 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3473 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3474 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3475
3476 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3477 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3478 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3479 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3480 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3481
3482 // Complex, from nxv2f32.
3483 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3484 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3485 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3486 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3487 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3488 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3489 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3490 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3491
3492 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3493 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3494 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3495 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3496 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3497 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3498 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3499
3500 // Complex, from nxv2f64.
3501 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3502 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3503 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3504 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3505 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3506 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3507 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3508 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3509 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3510 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3511
3512 // Complex, from nxv4f32.
3513 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3514 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3515 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3516 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3517 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3518 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3519 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3520 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3521 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3522 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3523
3524 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3525 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3526 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3527 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3528 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3529
3530 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3531 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3532 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3533 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3534 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3535 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3536 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3537
3538 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3539 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3540 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3541 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3542 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3543
3544 // Complex, from nxv8f16.
3545 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3546 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3547 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3548 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3549 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3550 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3551 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3552 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3553 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3554 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3555
3556 // Complex, from nxv4f16.
3557 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3558 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3559 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3560 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3561 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3562 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3563 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3564 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3565
3566 // Complex, from nxv2f16.
3567 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3568 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3569 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3570 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3571 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3572 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3573 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3574 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3575
3576 // Truncate from nxvmf32 to nxvmf16.
3577 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3578 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3579 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3580
3581 // Truncate from nxvmf32 to nxvmbf16.
3582 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3583 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3584 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3585
3586 // Truncate from nxvmf64 to nxvmf16.
3587 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3588 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3589 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3590
3591 // Truncate from nxvmf64 to nxvmbf16.
3592 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3593 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3594 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3595
3596 // Truncate from nxvmf64 to nxvmf32.
3597 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3598 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3599 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3600
3601 // Extend from nxvmf16 to nxvmf32.
3602 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3603 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3604 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3605
3606 // Extend from nxvmbf16 to nxvmf32.
3607 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3608 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3609 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3610
3611 // Extend from nxvmf16 to nxvmf64.
3612 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3613 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3614 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3615
3616 // Extend from nxvmbf16 to nxvmf64.
3617 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3618 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3619 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3620
3621 // Extend from nxvmf32 to nxvmf64.
3622 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3623 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3624 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3625
3626 // Bitcasts from float to integer
3627 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3628 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3629 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3630
3631 // Bitcasts from integer to float
3632 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3633 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3634 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3635
3636 // Add cost for extending to illegal -too wide- scalable vectors.
3637 // zero/sign extend are implemented by multiple unpack operations,
3638 // where each operation has a cost of 1.
3639 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3640 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3641 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3642 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3643 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3644 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3645
3646 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3647 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3648 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3649 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3650 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3651 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3652 };
3653
3654 // We have to estimate the cost of a fixed-length operation performed on
3655 // SVE registers by scaling with the number of SVE registers required to
3656 // represent the fixed-length type.
3657 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3658 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3659 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3660 ST->useSVEForFixedLengthVectors(WiderTy)) {
3661 std::pair<InstructionCost, MVT> LT =
3662 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3663 unsigned NumElements =
3664 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3665 return AdjustCost(
3666 LT.first *
3667 getCastInstrCost(
3668 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3669 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3670 CostKind, I));
3671 }
3672
3673 if (const auto *Entry = ConvertCostTableLookup(
3674 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3675 return AdjustCost(Entry->Cost);
3676
3677 static const TypeConversionCostTblEntry FP16Tbl[] = {
3678 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3679 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3680 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3681 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3682 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3683 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3684 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3685 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3686 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3687 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3688 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3689 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3690 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3691 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3692 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3693 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3694 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3695 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3696 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3697 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3698 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3699 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3700 };
3701
3702 if (ST->hasFullFP16())
3703 if (const auto *Entry = ConvertCostTableLookup(
3704 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3705 return AdjustCost(Entry->Cost);
3706
3707 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3708 // double-rounding issues.
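// (Converting i64 to f32 through an intermediate f64 would round twice, so
// each element is converted with a direct scalar i64->f32 [su]cvtf instead.)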
3709 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3710 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3712 return AdjustCost(
3714 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3715 CCH, CostKind) +
3717 CostKind) +
3719 CostKind));
3720
3721 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3722 CCH == TTI::CastContextHint::Masked &&
3723 ST->isSVEorStreamingSVEAvailable() &&
3724 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3725 TargetLowering::TypePromoteInteger &&
3726 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3727 TargetLowering::TypeSplitVector) {
3728 // The standard behaviour in the backend for these cases is to split the
3729 // extend up into two parts:
3730 // 1. Perform an extending load or masked load up to the legal type.
3731 // 2. Extend the loaded data to the final type.
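// For example, a zext from <vscale x 8 x i8> to <vscale x 8 x i64> is costed
// below as Part1 (step 1, up to the legal promoted type) plus Part2 (step 2,
// from the legal type to the final type).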
3732 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3733 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3734 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3735 Opcode, LegalTy, Src, CCH, CostKind, I);
3736 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3737 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3738 return Part1 + Part2;
3739 }
3740
3741 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3742 // but we also want to include the TTI::CastContextHint::Masked case too.
3743 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3744 CCH == TTI::CastContextHint::Masked &&
3745 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3746 CCH = TTI::CastContextHint::Normal;
3747
3748 return AdjustCost(
3749 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3750}
3751
3752InstructionCost
3753AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3754 VectorType *VecTy, unsigned Index,
3755 TTI::TargetCostKind CostKind) const {
3756
3757 // Make sure we were given a valid extend opcode.
3758 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3759 "Invalid opcode");
3760
3761 // We are extending an element we extract from a vector, so the source type
3762 // of the extend is the element type of the vector.
3763 auto *Src = VecTy->getElementType();
3764
3765 // Sign- and zero-extends are for integer types only.
3766 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3767
3768 // Get the cost for the extract. We compute the cost (if any) for the extend
3769 // below.
3770 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3771 CostKind, Index, nullptr, nullptr);
3772
3773 // Legalize the types.
3774 auto VecLT = getTypeLegalizationCost(VecTy);
3775 auto DstVT = TLI->getValueType(DL, Dst);
3776 auto SrcVT = TLI->getValueType(DL, Src);
3777
3778 // If the resulting type is still a vector and the destination type is legal,
3779 // we may get the extension for free. If not, get the default cost for the
3780 // extend.
3781 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3782 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3783 CostKind);
3784
3785 // The destination type should be larger than the element type. If not, get
3786 // the default cost for the extend.
3787 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3788 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3789 CostKind);
3790
3791 switch (Opcode) {
3792 default:
3793 llvm_unreachable("Opcode should be either SExt or ZExt");
3794
3795 // For sign-extends, we only need a smov, which performs the extension
3796 // automatically.
3797 case Instruction::SExt:
3798 return Cost;
3799
3800 // For zero-extends, the extend is performed automatically by a umov unless
3801 // the destination type is i64 and the element type is i8 or i16.
3802 case Instruction::ZExt:
3803 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3804 return Cost;
3805 }
3806
3807 // If we are unable to perform the extend for free, get the default cost.
3808 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3809 CostKind);
3810}
3811
3812InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3813 TTI::TargetCostKind CostKind,
3814 const Instruction *I) const {
3815 if (CostKind != TTI::TCK_RecipThroughput)
3816 return Opcode == Instruction::PHI ? 0 : 1;
3817 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3818 // Branches are assumed to be predicted.
3819 return 0;
3820}
3821
3822InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3823 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3824 const Instruction *I, Value *Scalar,
3825 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3826 assert(Val->isVectorTy() && "This must be a vector type");
3827
3828 if (Index != -1U) {
3829 // Legalize the type.
3830 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3831
3832 // This type is legalized to a scalar type.
3833 if (!LT.second.isVector())
3834 return 0;
3835
3836 // The type may be split. For fixed-width vectors we can normalize the
3837 // index to the new type.
3838 if (LT.second.isFixedLengthVector()) {
3839 unsigned Width = LT.second.getVectorNumElements();
3840 Index = Index % Width;
3841 }
3842
3843 // The element at index zero is already inside the vector.
3844 // - For an insert-element or extract-element instruction that operates on
3845 // integers, an explicit FPR -> GPR move is needed, so it has a non-zero
3846 // cost.
3847 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
3848 return 0;
3849
3850 // This is recognising a LD1 single-element structure to one lane of one
3851 // register instruction. I.e., if this is an `insertelement` instruction,
3852 // and its second operand is a load, then we will generate a LD1, which
3853 // is an expensive instruction.
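// For example, "%v = insertelement <4 x i32> %a, i32 %ld, i64 1" where %ld
// is a load is typically selected as "ld1 { v0.s }[1], [x0]" (illustrative).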
3854 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3855 return CostKind == TTI::TCK_CodeSize
3856 ? 0
3857 : ST->getVectorInsertExtractBaseCost() + 1;
3858
3859 // i1 inserts and extracts will include an extra cset or cmp of the vector
3860 // value. Increase the cost by 1 to account for this.
3861 if (Val->getScalarSizeInBits() == 1)
3862 return CostKind == TTI::TCK_CodeSize
3863 ? 2
3864 : ST->getVectorInsertExtractBaseCost() + 1;
3865
3866 // FIXME:
3867 // If the extract-element and insert-element instructions could be
3868 // simplified away (e.g., could be combined into users by looking at use-def
3869 // context), they have no cost. This is not done in the first place for
3870 // compile-time considerations.
3871 }
3872
3873 // In case of Neon, if there exists extractelement from lane != 0 such that
3874 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3875 // 2. extractelement result feeds into fmul.
3876 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3877 // equivalent to 0.
3878 // then the extractelement can be merged with fmul in the backend and it
3879 // incurs no cost.
3880 // e.g.
3881 // define double @foo(<2 x double> %a) {
3882 // %1 = extractelement <2 x double> %a, i32 0
3883 // %2 = extractelement <2 x double> %a, i32 1
3884 // %res = fmul double %1, %2
3885 // ret double %res
3886 // }
3887 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3888 auto ExtractCanFuseWithFmul = [&]() {
3889 // We bail out if the extract is from lane 0.
3890 if (Index == 0)
3891 return false;
3892
3893 // Check if the scalar element type of the vector operand of ExtractElement
3894 // instruction is one of the allowed types.
3895 auto IsAllowedScalarTy = [&](const Type *T) {
3896 return T->isFloatTy() || T->isDoubleTy() ||
3897 (T->isHalfTy() && ST->hasFullFP16());
3898 };
3899
3900 // Check if the extractelement user is scalar fmul.
3901 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3902 // Check if the user is scalar fmul.
3903 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
3904 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3905 !BO->getType()->isVectorTy();
3906 };
3907
3908 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3909 // certain scalar type and a certain vector register width.
3910 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3911 auto RegWidth =
3912 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
3913 .getFixedValue();
3914 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3915 };
3916
3917 // Check if the type constraints on input vector type and result scalar type
3918 // of extractelement instruction are satisfied.
3919 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3920 return false;
3921
3922 if (Scalar) {
3923 DenseMap<User *, unsigned> UserToExtractIdx;
3924 for (auto *U : Scalar->users()) {
3925 if (!IsUserFMulScalarTy(U))
3926 return false;
3927 // Recording entry for the user is important. Index value is not
3928 // important.
3929 UserToExtractIdx[U];
3930 }
3931 if (UserToExtractIdx.empty())
3932 return false;
3933 for (auto &[S, U, L] : ScalarUserAndIdx) {
3934 for (auto *U : S->users()) {
3935 if (UserToExtractIdx.contains(U)) {
3936 auto *FMul = cast<BinaryOperator>(U);
3937 auto *Op0 = FMul->getOperand(0);
3938 auto *Op1 = FMul->getOperand(1);
3939 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3940 UserToExtractIdx[U] = L;
3941 break;
3942 }
3943 }
3944 }
3945 }
3946 for (auto &[U, L] : UserToExtractIdx) {
3947 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3948 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3949 return false;
3950 }
3951 } else {
3952 const auto *EE = cast<ExtractElementInst>(I);
3953
3954 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3955 if (!IdxOp)
3956 return false;
3957
3958 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3959 if (!IsUserFMulScalarTy(U))
3960 return false;
3961
3962 // Check if the other operand of extractelement is also extractelement
3963 // from lane equivalent to 0.
3964 const auto *BO = cast<BinaryOperator>(U);
3965 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3966 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3967 if (OtherEE) {
3968 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3969 if (!IdxOp)
3970 return false;
3971 return IsExtractLaneEquivalentToZero(
3972 cast<ConstantInt>(OtherEE->getIndexOperand())
3973 ->getValue()
3974 .getZExtValue(),
3975 OtherEE->getType()->getScalarSizeInBits());
3976 }
3977 return true;
3978 });
3979 }
3980 return true;
3981 };
3982
3983 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3984 ExtractCanFuseWithFmul())
3985 return 0;
3986
3987 // All other insert/extracts cost this much.
3988 return CostKind == TTI::TCK_CodeSize ? 1
3989 : ST->getVectorInsertExtractBaseCost();
3990}
3991
3992InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3993 TTI::TargetCostKind CostKind,
3994 unsigned Index,
3995 const Value *Op0,
3996 const Value *Op1) const {
3997 // Treat insert at lane 0 into a poison vector as having zero cost. This
3998 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
3999 // single dup) are treated as cheap.
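// For example, the insert in the common splat idiom
//   %ins = insertelement <4 x i32> poison, i32 %s, i64 0
//   %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
// is free here; the whole sequence becomes a single dup.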
4000 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4001 isa<PoisonValue>(Op0))
4002 return 0;
4003 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
4004}
4005
4006InstructionCost AArch64TTIImpl::getVectorInstrCost(
4007 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4008 Value *Scalar,
4009 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4010 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4011 ScalarUserAndIdx);
4012}
4013
4014InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
4015 Type *Val,
4016 TTI::TargetCostKind CostKind,
4017 unsigned Index) const {
4018 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
4019}
4020
4024 unsigned Index) const {
4025 if (isa<FixedVectorType>(Val))
4027 Index);
4028
4029 // This typically requires both while and lastb instructions in order
4030 // to extract the last element. If this is in a loop the while
4031 // instruction can at least be hoisted out, although it will consume a
4032 // predicate register. The cost should be more expensive than the base
4033 // extract cost, which is 2 for most CPUs.
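// For example, extracting the last element of a <vscale x 4 x i32> value is
// typically a while/ptrue to build the predicate followed by a lastb.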
4034 return CostKind == TTI::TCK_CodeSize
4035 ? 2
4036 : ST->getVectorInsertExtractBaseCost() + 1;
4037}
4038
4039InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4040 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4041 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4042 ArrayRef<Value *> VL) const {
4043 if (isa<ScalableVectorType>(Ty))
4044 return InstructionCost::getInvalid();
4045 if (Ty->getElementType()->isFloatingPointTy())
4046 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4047 CostKind);
4048 unsigned VecInstCost =
4049 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4050 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4051}
4052
4053std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4054 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4055 TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
4056 std::function<InstructionCost(Type *)> InstCost) const {
4057 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4058 return std::nullopt;
4059 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4060 return std::nullopt;
4061
4062 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4063 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4064 TTI::CastContextHint::None, CostKind);
4065 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4066 Cost *= 2;
4067 Cost += InstCost(PromotedTy);
4068 if (IncludeTrunc)
4069 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4070 TTI::CastContextHint::None, CostKind);
4071 return Cost;
4072}
4073
4074InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4075 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4076 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4077 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4078
4079 // The code-generator is currently not able to handle scalable vectors
4080 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4081 // it. This change will be removed when code-generation for these types is
4082 // sufficiently reliable.
4083 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4084 if (VTy->getElementCount() == ElementCount::getScalable(1))
4085 return InstructionCost::getInvalid();
4086
4087 // TODO: Handle more cost kinds.
4088 if (CostKind != TTI::TCK_RecipThroughput)
4089 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4090 Op2Info, Args, CxtI);
4091
4092 // Legalize the type.
4093 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4094 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4095
4096 // Increase the cost for half and bfloat types if not architecturally
4097 // supported.
4098 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4099 ISD == ISD::FDIV || ISD == ISD::FREM)
4100 if (auto PromotedCost = getFP16BF16PromoteCost(
4101 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4102 [&](Type *PromotedTy) {
4103 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4104 Op1Info, Op2Info);
4105 }))
4106 return *PromotedCost;
4107
4108 switch (ISD) {
4109 default:
4110 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4111 Op2Info);
4112 case ISD::SREM:
4113 case ISD::SDIV:
4114 /*
4115 Notes for sdiv/srem specific costs:
4116 1. This only considers the cases where the divisor is constant, uniform and
4117 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4118 result in some form of (ldr + adrp), corresponding to constant vectors, or
4119 scalarization of the division operation.
4120 2. Constant divisors, either negative in whole or partially, don't result in
4121 significantly different codegen as compared to positive constant divisors.
4122 So, we don't consider negative divisors separately.
4123 3. If the codegen is significantly different with SVE, it has been indicated
4124 using comments at appropriate places.
4125
4126 sdiv specific cases:
4127 -----------------------------------------------------------------------
4128 codegen | pow-of-2 | Type
4129 -----------------------------------------------------------------------
4130 add + cmp + csel + asr | Y | i64
4131 add + cmp + csel + asr | Y | i32
4132 -----------------------------------------------------------------------
4133
4134 srem specific cases:
4135 -----------------------------------------------------------------------
4136 codegen | pow-of-2 | Type
4137 -----------------------------------------------------------------------
4138 negs + and + and + csneg | Y | i64
4139 negs + and + and + csneg | Y | i32
4140 -----------------------------------------------------------------------
4141
4142 other sdiv/srem cases:
4143 -------------------------------------------------------------------------
4144 common codegen | + srem | + sdiv | pow-of-2 | Type
4145 -------------------------------------------------------------------------
4146 smulh + asr + add + add | - | - | N | i64
4147 smull + lsr + add + add | - | - | N | i32
4148 usra | and + sub | sshr | Y | <2 x i64>
4149 2 * (scalar code) | - | - | N | <2 x i64>
4150 usra | bic + sub | sshr + neg | Y | <4 x i32>
4151 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4152 + sshr + usra | | | |
4153 -------------------------------------------------------------------------
4154 */
4155 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4156 InstructionCost AddCost =
4157 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4158 Op1Info.getNoProps(), Op2Info.getNoProps());
4159 InstructionCost AsrCost =
4160 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4161 Op1Info.getNoProps(), Op2Info.getNoProps());
4162 InstructionCost MulCost =
4163 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4164 Op1Info.getNoProps(), Op2Info.getNoProps());
4165 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4166 // have similar cost.
4167 auto VT = TLI->getValueType(DL, Ty);
4168 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4169 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4170 // Neg can be folded into the asr instruction.
4171 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4172 : (3 * AsrCost + AddCost);
4173 } else {
4174 return MulCost + AsrCost + 2 * AddCost;
4175 }
4176 } else if (VT.isVector()) {
4177 InstructionCost UsraCost = 2 * AsrCost;
4178 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4179 // Division with scalable types corresponds to native 'asrd'
4180 // instruction when SVE is available.
4181 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4182
4183 // One more for the negation in SDIV
4184 InstructionCost Cost =
4185 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4186 if (Ty->isScalableTy() && ST->hasSVE())
4187 Cost += 2 * AsrCost;
4188 else {
4189 Cost +=
4190 UsraCost +
4191 (ISD == ISD::SDIV
4192 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4193 : 2 * AddCost);
4194 }
4195 return Cost;
4196 } else if (LT.second == MVT::v2i64) {
4197 return VT.getVectorNumElements() *
4198 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4199 Op1Info.getNoProps(),
4200 Op2Info.getNoProps());
4201 } else {
4202 // When SVE is available, we get:
4203 // smulh + lsr + add/sub + asr + add/sub.
4204 if (Ty->isScalableTy() && ST->hasSVE())
4205 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4206 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4207 }
4208 }
4209 }
4210 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4211 LT.second.isFixedLengthVector()) {
4212 // FIXME: When the constant vector is non-uniform, this may result in
4213 // loading the vector from constant pool or in some cases, may also result
4214 // in scalarization. For now, we are approximating this with the
4215 // scalarization cost.
4216 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4217 CostKind, -1, nullptr, nullptr);
4218 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4219 CostKind, -1, nullptr, nullptr);
4220 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4221 return ExtractCost + InsertCost +
4222 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4223 CostKind, Op1Info.getNoProps(),
4224 Op2Info.getNoProps());
4225 }
4226 [[fallthrough]];
4227 case ISD::UDIV:
4228 case ISD::UREM: {
4229 auto VT = TLI->getValueType(DL, Ty);
4230 if (Op2Info.isConstant()) {
4231 // If the operand is a power of 2 we can use the shift or and cost.
4232 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4233 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4234 Op1Info.getNoProps(),
4235 Op2Info.getNoProps());
4236 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4237 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4238 Op1Info.getNoProps(),
4239 Op2Info.getNoProps());
4240
4241 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4242 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4243 // The MULHU will be expanded to UMULL for the types not listed below,
4244 // and will become a UMULL+UMULL2 pair for 128-bit vectors.
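// For example, a udiv <4 x i32> by a non-power-of-2 constant becomes roughly
// umull + umull2 (the MULHU) plus the shifts and adds that DivCost sums up
// below.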
4245 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4246 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4247 LT.second == MVT::nxv16i8;
4248 bool Is128bit = LT.second.is128BitVector();
4249
4250 InstructionCost MulCost =
4251 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4252 Op1Info.getNoProps(), Op2Info.getNoProps());
4253 InstructionCost AddCost =
4254 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4255 Op1Info.getNoProps(), Op2Info.getNoProps());
4256 InstructionCost ShrCost =
4257 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4258 Op1Info.getNoProps(), Op2Info.getNoProps());
4259 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4260 (HasMULH ? 0 : ShrCost) + // UMULL shift
4261 AddCost * 2 + ShrCost;
4262 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4263 }
4264 }
4265
4266 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4267 // emitted by the backend even when those functions are not declared in the
4268 // module.
4269 if (!VT.isVector() && VT.getSizeInBits() > 64)
4270 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4271
4273 Opcode, Ty, CostKind, Op1Info, Op2Info);
4274 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4275 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4276 // SDIV/UDIV operations are lowered using SVE, so the cost can be
4277 // lower.
4278 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4279 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4280 static const CostTblEntry DivTbl[]{
4281 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4282 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4283 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4284 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4285 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4286 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4287
4288 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4289 if (nullptr != Entry)
4290 return Entry->Cost;
4291 }
4292 // For 8/16-bit elements, the cost is higher because the type
4293 // requires promotion and possibly splitting:
4294 if (LT.second.getScalarType() == MVT::i8)
4295 Cost *= 8;
4296 else if (LT.second.getScalarType() == MVT::i16)
4297 Cost *= 4;
4298 return Cost;
4299 } else {
4300 // If one of the operands is a uniform constant then the cost for each
4301 // element is Cost for insertion, extraction and division.
4302 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4303 // operation with scalar type
4304 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4305 (Op2Info.isConstant() && Op2Info.isUniform())) {
4306 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4307 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4308 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4309 return (4 + DivCost) * VTy->getNumElements();
4310 }
4311 }
4312 // On AArch64, without SVE, vector divisions are expanded
4313 // into scalar divisions of each pair of elements.
4314 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4315 -1, nullptr, nullptr);
4316 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4317 nullptr, nullptr);
4318 }
4319
4320 // TODO: if one of the arguments is scalar, then it's not necessary to
4321 // double the cost of handling the vector elements.
4322 Cost += Cost;
4323 }
4324 return Cost;
4325 }
4326 case ISD::MUL:
4327 // When SVE is available, then we can lower the v2i64 operation using
4328 // the SVE mul instruction, which has a lower cost.
4329 if (LT.second == MVT::v2i64 && ST->hasSVE())
4330 return LT.first;
4331
4332 // When SVE is not available, there is no MUL.2d instruction,
4333 // which means mul <2 x i64> is expensive as elements are extracted
4334 // from the vectors and the muls scalarized.
4335 // As getScalarizationOverhead is a bit too pessimistic, we
4336 // estimate the cost for a i64 vector directly here, which is:
4337 // - four 2-cost i64 extracts,
4338 // - two 2-cost i64 inserts, and
4339 // - two 1-cost muls.
4340 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4341 // LT.first = 2 the cost is 28. If both operands are extensions it will not
4342 // need to scalarize, so the cost can be cheaper (smull or umull).
4344 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
4345 return LT.first;
4346 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4347 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4348 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4349 nullptr, nullptr) *
4350 2 +
4351 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4352 nullptr, nullptr));
4353 case ISD::ADD:
4354 case ISD::XOR:
4355 case ISD::OR:
4356 case ISD::AND:
4357 case ISD::SRL:
4358 case ISD::SRA:
4359 case ISD::SHL:
4360 // These nodes are marked as 'custom' for combining purposes only.
4361 // We know that they are legal. See LowerAdd in ISelLowering.
4362 return LT.first;
4363
4364 case ISD::FNEG:
4365 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4366 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4367 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4368 CxtI &&
4369 ((CxtI->hasOneUse() &&
4370 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4371 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4372 return 0;
4373 [[fallthrough]];
4374 case ISD::FADD:
4375 case ISD::FSUB:
4376 if (!Ty->getScalarType()->isFP128Ty())
4377 return LT.first;
4378 [[fallthrough]];
4379 case ISD::FMUL:
4380 case ISD::FDIV:
4381 // These nodes are marked as 'custom' just to lower them to SVE.
4382 // We know said lowering will incur no additional cost.
4383 if (!Ty->getScalarType()->isFP128Ty())
4384 return 2 * LT.first;
4385
4386 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4387 Op2Info);
4388 case ISD::FREM:
4389 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4390 // those functions are not declared in the module.
4391 if (!Ty->isVectorTy())
4392 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4393 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4394 Op2Info);
4395 }
4396}
4397
4398InstructionCost
4399AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4400 const SCEV *Ptr,
4401 TTI::TargetCostKind CostKind) const {
4402 // Address computations in vectorized code with non-consecutive addresses will
4403 // likely result in more instructions compared to scalar code where the
4404 // computation can more often be merged into the index mode. The resulting
4405 // extra micro-ops can significantly decrease throughput.
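// For example, a gather/scatter over non-consecutive addresses keeps its
// vector address arithmetic as separate instructions, whereas a consecutive
// access can usually fold the address into a reg+imm or reg+reg mode.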
4406 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4407 int MaxMergeDistance = 64;
4408
4409 if (PtrTy->isVectorTy() && SE &&
4410 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4411 return NumVectorInstToHideOverhead;
4412
4413 // In many cases the address computation is not merged into the instruction
4414 // addressing mode.
4415 return 1;
4416}
4417
4418/// Check whether Opcode1 has less throughput according to the scheduling
4419/// model than Opcode2.
4420bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4421 unsigned Opcode1, unsigned Opcode2) const {
4422 const MCSchedModel &Sched = ST->getSchedModel();
4423 const TargetInstrInfo *TII = ST->getInstrInfo();
4424 if (!Sched.hasInstrSchedModel())
4425 return false;
4426
4427 const MCSchedClassDesc *SCD1 =
4428 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4429 const MCSchedClassDesc *SCD2 =
4430 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4431 // We cannot handle variant scheduling classes without an MI. If we need to
4432 // support them for any of the instructions we query the information of we
4433 // might need to add a way to resolve them without a MI or not use the
4434 // scheduling info.
4435 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4436 "Cannot handle variant scheduling classes without an MI");
4437 if (!SCD1->isValid() || !SCD2->isValid())
4438 return false;
4439
4440 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4441 MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4442}
4443
4444InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4445 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4446 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4447 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4448 // We don't lower some vector selects well when they are wider than the
4449 // register width. TODO: Improve this with different cost kinds.
4450 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4451 // We would need this many instructions to hide the scalarization happening.
4452 const int AmortizationCost = 20;
4453
4454 // If VecPred is not set, check if we can get a predicate from the context
4455 // instruction, if its type matches the requested ValTy.
4456 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4457 CmpPredicate CurrentPred;
4458 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4459 m_Value())))
4460 VecPred = CurrentPred;
4461 }
4462 // Check if we have a compare/select chain that can be lowered using
4463 // a (F)CMxx & BFI pair.
4464 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4465 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4466 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4467 VecPred == CmpInst::FCMP_UNE) {
4468 static const auto ValidMinMaxTys = {
4469 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4470 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4471 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4472
4473 auto LT = getTypeLegalizationCost(ValTy);
4474 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4475 (ST->hasFullFP16() &&
4476 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
4477 return LT.first;
4478 }
4479
4480 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4481 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4482 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4483 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4484 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4485 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4486 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4487 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4488 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4489 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4490 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4491 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4492
4493 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4494 EVT SelValTy = TLI->getValueType(DL, ValTy);
4495 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4496 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4497 SelCondTy.getSimpleVT(),
4498 SelValTy.getSimpleVT()))
4499 return Entry->Cost;
4500 }
4501 }
4502
4503 if (Opcode == Instruction::FCmp) {
4504 if (auto PromotedCost = getFP16BF16PromoteCost(
4505 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4506 [&](Type *PromotedTy) {
4507 InstructionCost Cost =
4508 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4509 CostKind, Op1Info, Op2Info);
4510 if (isa<VectorType>(PromotedTy))
4511 Cost += getCastInstrCost(
4512 Instruction::Trunc,
4513 VectorType::getInteger(cast<VectorType>(ValTy)),
4514 VectorType::getInteger(cast<VectorType>(PromotedTy)),
4515 TTI::CastContextHint::None, CostKind);
4516 return Cost;
4517 }))
4518 return *PromotedCost;
4519
4520 auto LT = getTypeLegalizationCost(ValTy);
4521 // Model unknown fp compares as a libcall.
4522 if (LT.second.getScalarType() != MVT::f64 &&
4523 LT.second.getScalarType() != MVT::f32 &&
4524 LT.second.getScalarType() != MVT::f16)
4525 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4526 {ValTy, ValTy}, CostKind);
4527
4528 // Some comparison operators require expanding to multiple compares + or.
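 // For example (as a rough illustration): a scalar fcmp one/ueq expands to a
 // compare plus two selects (Factor = 2), while on a fixed-width vector fcmp
 // one/ueq/ord/uno becomes two fcm instructions combined with an orr
 // (Factor = 3).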
4529 unsigned Factor = 1;
4530 if (!CondTy->isVectorTy() &&
4531 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4532 Factor = 2; // fcmp with 2 selects
4533 else if (isa<FixedVectorType>(ValTy) &&
4534 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4535 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4536 Factor = 3; // fcmxx+fcmyy+or
4537 else if (isa<ScalableVectorType>(ValTy) &&
4538 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4539 Factor = 3; // fcmxx+fcmyy+or
4540
4541 if (isa<ScalableVectorType>(ValTy) &&
4542 CostKind == TTI::TCK_RecipThroughput &&
4543 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4544 AArch64::FCMEQv4f32))
4545 Factor *= 2;
4546
4547 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4548 }
4549
4550 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4551 // icmp(and, 0) as free, as we can make use of ands, but only if the
4552 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4553 // providing it will not cause performance regressions.
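 // For example, (icmp eq (and %a, %b), 0) can reuse the flags set by a single
 // ands/tst instruction, so the compare itself is modelled as free here
 // (illustrative; subject to the legality checks below).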
4554 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4555 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4556 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4557 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4558 if (match(I->getOperand(1), m_Zero()))
4559 return 0;
4560
4561 // x >= 1 / x < 1 -> x > 0 / x <= 0
4562 if (match(I->getOperand(1), m_One()) &&
4563 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4564 return 0;
4565
4566 // x <= -1 / x > -1 -> x > 0 / x <= 0
4567 if (match(I->getOperand(1), m_AllOnes()) &&
4568 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4569 return 0;
4570 }
4571
4572 // The base case handles scalable vectors fine for now, since it treats the
4573 // cost as 1 * legalization cost.
4574 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4575 Op1Info, Op2Info, I);
4576}
4577
4578TTI::MemCmpExpansionOptions
4579AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4580 TTI::MemCmpExpansionOptions Options;
4581 if (ST->requiresStrictAlign()) {
4582 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4583 // a bunch of instructions when strict align is enabled.
4584 return Options;
4585 }
4586 Options.AllowOverlappingLoads = true;
4587 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4588 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4589 // TODO: Though vector loads usually perform well on AArch64, in some targets
4590 // they may wake up the FP unit, which raises the power consumption. Perhaps
4591 // they could be used with no holds barred (-O3).
4592 Options.LoadSizes = {8, 4, 2, 1};
4593 Options.AllowedTailExpansions = {3, 5, 6};
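 // As a rough illustration: with overlapping loads allowed and load sizes of
 // 8/4/2/1 bytes, a 15-byte memcmp can typically be expanded with two 8-byte
 // loads per operand (at offsets 0 and 7) instead of an 8+4+2+1 chain.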
4594 return Options;
4595}
4596
4597bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4598 return ST->hasSVE();
4599}
4600
4601InstructionCost
4602AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
4603 Align Alignment, unsigned AddressSpace,
4604 TTI::TargetCostKind CostKind) const {
4605 if (useNeonVector(Src))
4606 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4607 CostKind);
4608 auto LT = getTypeLegalizationCost(Src);
4609 if (!LT.first.isValid())
4610 return InstructionCost::getInvalid();
4611
4612 // Return an invalid cost for element types that we are unable to lower.
4613 auto *VT = cast<VectorType>(Src);
4614 if (VT->getElementType()->isIntegerTy(1))
4615 return InstructionCost::getInvalid();
4616
4617 // The code-generator is currently not able to handle scalable vectors
4618 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4619 // it. This change will be removed when code-generation for these types is
4620 // sufficiently reliable.
4621 if (VT->getElementCount() == ElementCount::getScalable(1))
4622 return InstructionCost::getInvalid();
4623
4624 return LT.first;
4625}
4626
4627// This function returns gather/scatter overhead either from
4628// user-provided value or specialized values per-target from \p ST.
4629static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4630 const AArch64Subtarget *ST) {
4631 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4632 "Should be called on only load or stores.");
4633 switch (Opcode) {
4634 case Instruction::Load:
4635 if (SVEGatherOverhead.getNumOccurrences() > 0)
4636 return SVEGatherOverhead;
4637 return ST->getGatherOverhead();
4638 break;
4639 case Instruction::Store:
4640 if (SVEScatterOverhead.getNumOccurrences() > 0)
4641 return SVEScatterOverhead;
4642 return ST->getScatterOverhead();
4643 break;
4644 default:
4645 llvm_unreachable("Shouldn't have reached here");
4646 }
4647}
4648
4649InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
4650 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
4651 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
4652 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4653 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
4654 Alignment, CostKind, I);
4655 auto *VT = cast<VectorType>(DataTy);
4656 auto LT = getTypeLegalizationCost(DataTy);
4657 if (!LT.first.isValid())
4658 return InstructionCost::getInvalid();
4659
4660 // Return an invalid cost for element types that we are unable to lower.
4661 if (!LT.second.isVector() ||
4662 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4663 VT->getElementType()->isIntegerTy(1))
4664 return InstructionCost::getInvalid();
4665
4666 // The code-generator is currently not able to handle scalable vectors
4667 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4668 // it. This change will be removed when code-generation for these types is
4669 // sufficiently reliable.
4670 if (VT->getElementCount() == ElementCount::getScalable(1))
4671 return InstructionCost::getInvalid();
4672
4673 ElementCount LegalVF = LT.second.getVectorElementCount();
4674 InstructionCost MemOpCost =
4675 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4676 {TTI::OK_AnyValue, TTI::OP_None}, I);
4677 // Add on an overhead cost for using gathers/scatters.
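 // As a rough illustration: for a <vscale x 4 x i32> gather, the returned
 // cost below is the scalar memory-op cost scaled by the per-target
 // gather/scatter overhead and by getMaxNumElements(nxv4i32), i.e. it grows
 // with the maximum number of lanes the legalized type can hold.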
4678 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4679 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4680}
4681
4682bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
4683 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4684}
4685
4686InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
4687 Align Alignment,
4688 unsigned AddressSpace,
4689 TTI::TargetCostKind CostKind,
4690 TTI::OperandValueInfo OpInfo,
4691 const Instruction *I) const {
4692 EVT VT = TLI->getValueType(DL, Ty, true);
4693 // Type legalization can't handle structs
4694 if (VT == MVT::Other)
4695 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4696 CostKind);
4697
4698 auto LT = getTypeLegalizationCost(Ty);
4699 if (!LT.first.isValid())
4700 return InstructionCost::getInvalid();
4701
4702 // The code-generator is currently not able to handle scalable vectors
4703 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4704 // it. This change will be removed when code-generation for these types is
4705 // sufficiently reliable.
4706 // We also only support full register predicate loads and stores.
4707 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4708 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4709 (VTy->getElementType()->isIntegerTy(1) &&
4710 !VTy->getElementCount().isKnownMultipleOf(
4711 ElementCount::getScalable(16))))
4712 return InstructionCost::getInvalid();
4713
4714 // TODO: consider latency as well for TCK_SizeAndLatency.
4715 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
4716 return LT.first;
4717
4718 if (CostKind != TTI::TCK_RecipThroughput)
4719 return 1;
4720
4721 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4722 LT.second.is128BitVector() && Alignment < Align(16)) {
4723 // Unaligned stores are extremely inefficient. We don't split all
4724 // unaligned 128-bit stores because of the negative impact that has shown
4725 // up in practice on inlined block-copy code.
4726 // We make such stores expensive so that we will only vectorize if there
4727 // are 6 other instructions getting vectorized.
4728 const int AmortizationCost = 6;
4729
4730 return LT.first * 2 * AmortizationCost;
4731 }
4732
4733 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4734 if (Ty->isPtrOrPtrVectorTy())
4735 return LT.first;
4736
4737 if (useNeonVector(Ty)) {
4738 // Check truncating stores and extending loads.
4739 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4740 // v4i8 types are lowered to scalar a load/store and sshll/xtn.
4741 if (VT == MVT::v4i8)
4742 return 2;
4743 // Otherwise we need to scalarize.
4744 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4745 }
4746 EVT EltVT = VT.getVectorElementType();
4747 unsigned EltSize = EltVT.getScalarSizeInBits();
4748 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4749 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4750 return LT.first;
4751 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4752 // widening to v4i8, which produces suboptimal results.
4753 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4754 return LT.first;
4755
4756 // Check non-power-of-2 loads/stores for legal vector element types with
4757 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4758 // operations on smaller power-of-2 ops, including ld1/st1.
4759 LLVMContext &C = Ty->getContext();
4760 InstructionCost Cost = 0;
4761 SmallVector<EVT> TypeWorklist;
4762 TypeWorklist.push_back(VT);
4763 while (!TypeWorklist.empty()) {
4764 EVT CurrVT = TypeWorklist.pop_back_val();
4765 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4766 if (isPowerOf2_32(CurrNumElements)) {
4767 Cost += 1;
4768 continue;
4769 }
4770
4771 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4772 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4773 TypeWorklist.push_back(
4774 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4775 }
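 // Worked example (illustrative): a v7i16 access decomposes into v4i16 +
 // v3i16, and v3i16 further into v2i16 + v1i16, so the loop above yields a
 // cost of 3.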
4776 return Cost;
4777 }
4778
4779 return LT.first;
4780}
4781
4782InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
4783 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4784 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4785 bool UseMaskForCond, bool UseMaskForGaps) const {
4786 assert(Factor >= 2 && "Invalid interleave factor");
4787 auto *VecVTy = cast<VectorType>(VecTy);
4788
4789 if (VecTy->isScalableTy() && !ST->hasSVE())
4790 return InstructionCost::getInvalid();
4791
4792 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4793 // only have lowering for power-of-2 factors.
4794 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
4795 // InterleavedAccessPass for ld3/st3
4796 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
4797 return InstructionCost::getInvalid();
4798
4799 // Vectorization for masked interleaved accesses is only enabled for scalable
4800 // VF.
4801 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4802 return InstructionCost::getInvalid();
4803
4804 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4805 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4806 auto *SubVecTy =
4807 VectorType::get(VecVTy->getElementType(),
4808 VecVTy->getElementCount().divideCoefficientBy(Factor));
4809
4810 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4811 // Accesses having vector types that are a multiple of 128 bits can be
4812 // matched to more than one ldN/stN instruction.
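 // For example (illustrative): an ld3 of <12 x i32> uses a <4 x i32>
 // sub-vector, which is a single legal 128-bit access, so the cost below is
 // Factor (3) * 1 = 3.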
4813 bool UseScalable;
4814 if (MinElts % Factor == 0 &&
4815 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4816 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4817 }
4818
4819 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4820 Alignment, AddressSpace, CostKind,
4821 UseMaskForCond, UseMaskForGaps);
4822}
4823
4824InstructionCost
4825AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
4826 InstructionCost Cost = 0;
4827 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4828 for (auto *I : Tys) {
4829 if (!I->isVectorTy())
4830 continue;
4831 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4832 128)
4833 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4834 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4835 }
4836 return Cost;
4837}
4838
4839unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) const {
4840 return ST->getMaxInterleaveFactor();
4841}
4842
4843// For Falkor, we want to avoid having too many strided loads in a loop since
4844// that can exhaust the HW prefetcher resources. We adjust the unroller
4845// MaxCount preference below to attempt to ensure unrolling doesn't create too
4846// many strided loads.
4847static void
4848getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4849 TargetTransformInfo::UnrollingPreferences &UP) {
4850 enum { MaxStridedLoads = 7 };
4851 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4852 int StridedLoads = 0;
4853 // FIXME? We could make this more precise by looking at the CFG and
4854 // e.g. not counting loads in each side of an if-then-else diamond.
4855 for (const auto BB : L->blocks()) {
4856 for (auto &I : *BB) {
4857 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4858 if (!LMemI)
4859 continue;
4860
4861 Value *PtrValue = LMemI->getPointerOperand();
4862 if (L->isLoopInvariant(PtrValue))
4863 continue;
4864
4865 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4866 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4867 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4868 continue;
4869
4870 // FIXME? We could take pairing of unrolled load copies into account
4871 // by looking at the AddRec, but we would probably have to limit this
4872 // to loops with no stores or other memory optimization barriers.
4873 ++StridedLoads;
4874 // We've seen enough strided loads that seeing more won't make a
4875 // difference.
4876 if (StridedLoads > MaxStridedLoads / 2)
4877 return StridedLoads;
4878 }
4879 }
4880 return StridedLoads;
4881 };
4882
4883 int StridedLoads = countStridedLoads(L, SE);
4884 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4885 << " strided loads\n");
4886 // Pick the largest power of 2 unroll count that won't result in too many
4887 // strided loads.
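 // For example (illustrative): with 3 strided loads detected, MaxCount
 // becomes 1 << Log2_32(7 / 3) = 2.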
4888 if (StridedLoads) {
4889 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
4890 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4891 << UP.MaxCount << '\n');
4892 }
4893}
4894
4895// This function returns true if the loop:
4896// 1. Has a valid cost, and
4897// 2. Has a cost within the supplied budget.
4898// Otherwise it returns false.
4899static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
4900 InstructionCost Budget,
4901 unsigned *FinalSize) {
4902 // Estimate the size of the loop.
4903 InstructionCost LoopCost = 0;
4904
4905 for (auto *BB : L->getBlocks()) {
4906 for (auto &I : *BB) {
4907 SmallVector<const Value *, 4> Operands(I.operand_values());
4908 InstructionCost Cost =
4909 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
4910 // This can happen with intrinsics that don't currently have a cost model
4911 // or for some operations that require SVE.
4912 if (!Cost.isValid())
4913 return false;
4914
4915 LoopCost += Cost;
4916 if (LoopCost > Budget)
4917 return false;
4918 }
4919 }
4920
4921 if (FinalSize)
4922 *FinalSize = LoopCost.getValue();
4923 return true;
4924}
4925
4926static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
4927 const AArch64TTIImpl &TTI) {
4928 // Only consider loops with unknown trip counts for which we can determine
4929 // a symbolic expression. Multi-exit loops with small known trip counts will
4930 // likely be unrolled anyway.
4931 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4932 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
4933 return false;
4934
4935 // It might not be worth unrolling loops with low max trip counts. Restrict
4936 // this to max trip counts > 32 for now.
4937 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
4938 if (MaxTC > 0 && MaxTC <= 32)
4939 return false;
4940
4941 // Make sure the loop size is <= 5.
4942 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
4943 return false;
4944
4945 // Small search loops with multiple exits can be highly beneficial to unroll.
4946 // We only care about loops with exactly two exiting blocks, although each
4947 // block could jump to the same exit block.
4948 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
4949 if (Blocks.size() != 2)
4950 return false;
4951
4952 if (any_of(Blocks, [](BasicBlock *BB) {
4953 return !isa<BranchInst>(BB->getTerminator());
4954 }))
4955 return false;
4956
4957 return true;
4958}
4959
4960/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4961/// OOO engine's wide instruction window and various predictors.
4962static void
4963getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4964 TargetTransformInfo::UnrollingPreferences &UP,
4965 const AArch64TTIImpl &TTI) {
4966 // Limit to loops with structure that is highly likely to benefit from
4967 // runtime unrolling; that is, we exclude outer loops and loops with many
4968 // blocks (i.e. likely with complex control flow). Note that the heuristics
4969 // here may be overly conservative and we err on the side of avoiding runtime
4970 // unrolling rather than unrolling excessively; they are subject to refinement.
4971 if (!L->isInnermost() || L->getNumBlocks() > 8)
4972 return;
4973
4974 // Loops with multiple exits are handled by common code.
4975 if (!L->getExitBlock())
4976 return;
4977
4978 // Check if the loop contains any reductions that could be parallelized when
4979 // unrolling. If so, enable partial unrolling, if the trip count is known to be
4980 // a multiple of 2.
4981 bool HasParellelizableReductions =
4982 L->getNumBlocks() == 1 &&
4983 any_of(L->getHeader()->phis(),
4984 [&SE, L](PHINode &Phi) {
4985 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
4986 }) &&
4987 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
4988 if (HasParellelizableReductions &&
4989 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
4990 UP.Partial = true;
4991 UP.MaxCount = 4;
4992 UP.AddAdditionalAccumulators = true;
4993 }
4994
4995 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4996 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4997 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4998 SE.getSmallConstantMaxTripCount(L) <= 32))
4999 return;
5000
5001 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5002 return;
5003
5005 return;
5006
5007 // Limit to loops with trip counts that are cheap to expand.
5008 UP.SCEVExpansionBudget = 1;
5009
5010 if (HasParellelizableReductions) {
5011 UP.Runtime = true;
5013 UP.AddAdditionalAccumulators = true;
5014 }
5015
5016 // Try to unroll small loops with few blocks and a low size budget, if they
5017 // have load/store dependencies, to expose more parallel memory access
5018 // streams, or if they do little work inside a block (i.e. load -> X -> store).
5019 BasicBlock *Header = L->getHeader();
5020 BasicBlock *Latch = L->getLoopLatch();
5021 if (Header == Latch) {
5022 // Estimate the size of the loop.
5023 unsigned Size;
5024 unsigned Width = 10;
5025 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5026 return;
5027
5028 // Try to find an unroll count that maximizes the use of the instruction
5029 // window, i.e. trying to fetch as many instructions per cycle as possible.
5030 unsigned MaxInstsPerLine = 16;
5031 unsigned UC = 1;
5032 unsigned BestUC = 1;
5033 unsigned SizeWithBestUC = BestUC * Size;
5034 while (UC <= 8) {
5035 unsigned SizeWithUC = UC * Size;
5036 if (SizeWithUC > 48)
5037 break;
5038 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5039 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5040 BestUC = UC;
5041 SizeWithBestUC = BestUC * Size;
5042 }
5043 UC++;
5044 }
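 // Illustrative example: for a single-block loop of size 10, the candidate
 // sizes are 10/20/30/40 (50 exceeds the 48 limit); 30 leaves the largest
 // remainder modulo 16, so BestUC ends up as 3.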
5045
5046 if (BestUC == 1)
5047 return;
5048
5049 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5050 SmallVector<StoreInst *> Stores;
5051 for (auto *BB : L->blocks()) {
5052 for (auto &I : *BB) {
5053 Value *Ptr = getLoadStorePointerOperand(&I);
5054 if (!Ptr)
5055 continue;
5056 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5057 if (SE.isLoopInvariant(PtrSCEV, L))
5058 continue;
5059 if (isa<LoadInst>(&I)) {
5060 LoadedValuesPlus.insert(&I);
5061 // Include in-loop 1st users of loaded values.
5062 for (auto *U : I.users())
5063 if (L->contains(cast<Instruction>(U)))
5064 LoadedValuesPlus.insert(U);
5065 } else
5066 Stores.push_back(cast<StoreInst>(&I));
5067 }
5068 }
5069
5070 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5071 return LoadedValuesPlus.contains(SI->getOperand(0));
5072 }))
5073 return;
5074
5075 UP.Runtime = true;
5076 UP.DefaultUnrollRuntimeCount = BestUC;
5077 return;
5078 }
5079
5080 // Try to runtime-unroll loops with early-continues depending on loop-varying
5081 // loads; this helps with branch-prediction for the early-continues.
5082 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5083 SmallVector<BasicBlock *> Preds(predecessors(Latch));
5084 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5085 !llvm::is_contained(Preds, Header) ||
5086 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5087 return;
5088
5089 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5090 [&](Instruction *I, unsigned Depth) -> bool {
5091 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5092 return false;
5093
5094 if (isa<LoadInst>(I))
5095 return true;
5096
5097 return any_of(I->operands(), [&](Value *V) {
5098 auto *I = dyn_cast<Instruction>(V);
5099 return I && DependsOnLoopLoad(I, Depth + 1);
5100 });
5101 };
5102 CmpPredicate Pred;
5103 Instruction *I;
5104 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5105 m_Value())) &&
5106 DependsOnLoopLoad(I, 0)) {
5107 UP.Runtime = true;
5108 }
5109}
5110
5111void AArch64TTIImpl::getUnrollingPreferences(
5112 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
5113 OptimizationRemarkEmitter *ORE) const {
5114 // Enable partial unrolling and runtime unrolling.
5115 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5116
5117 UP.UpperBound = true;
5118
5119 // An inner loop is more likely to be hot, and the runtime check can be
5120 // hoisted out by the LICM pass, so the overhead is lower; try a larger
5121 // threshold to unroll more loops.
5122 if (L->getLoopDepth() > 1)
5123 UP.PartialThreshold *= 2;
5124
5125 // Disable partial & runtime unrolling on -Os.
5126 UP.PartialOptSizeThreshold = 0;
5127
5128 // Scan the loop: don't unroll loops with calls as this could prevent
5129 // inlining. Don't unroll auto-vectorized loops either, though do allow
5130 // unrolling of the scalar remainder.
5131 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5132 for (auto *BB : L->getBlocks()) {
5133 for (auto &I : *BB) {
5134 // Both auto-vectorized loops and the scalar remainder have the
5135 // isvectorized attribute, so differentiate between them by the presence
5136 // of vector instructions.
5137 if (IsVectorized && I.getType()->isVectorTy())
5138 return;
5139 if (isa<CallBase>(I)) {
5140 if (isa<CallInst>(I) || isa<InvokeInst>(I))
5141 if (const Function *F = cast<CallBase>(I).getCalledFunction())
5142 if (!isLoweredToCall(F))
5143 continue;
5144 return;
5145 }
5146 }
5147 }
5148
5149 // Apply subtarget-specific unrolling preferences.
5150 switch (ST->getProcFamily()) {
5151 case AArch64Subtarget::AppleA14:
5152 case AArch64Subtarget::AppleA15:
5153 case AArch64Subtarget::AppleA16:
5154 case AArch64Subtarget::AppleM4:
5155 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5156 break;
5157 case AArch64Subtarget::Falkor:
5158 if (EnableFalkorHWPFUnrollFix)
5159 getFalkorUnrollingPreferences(L, SE, UP);
5160 break;
5161 default:
5162 break;
5163 }
5164
5165 // If this is a small, multi-exit loop similar to something like std::find,
5166 // then there is typically a performance improvement achieved by unrolling.
5167 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5168 UP.RuntimeUnrollMultiExit = true;
5169 UP.Runtime = true;
5170 // Limit unroll count.
5171 UP.DefaultUnrollRuntimeCount = 4;
5172 // Allow slightly more costly trip-count expansion to catch search loops
5173 // with pointer inductions.
5174 UP.SCEVExpansionBudget = 5;
5175 return;
5176 }
5177
5178 // Enable runtime unrolling for in-order models
5179 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
5180 // checking for that case, we can ensure that the default behaviour is
5181 // unchanged
5182 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5183 !ST->getSchedModel().isOutOfOrder()) {
5184 UP.Runtime = true;
5185 UP.Partial = true;
5186 UP.UnrollRemainder = true;
5187 UP.DefaultUnrollRuntimeCount = 4;
5188
5189 UP.UnrollAndJam = true;
5190 UP.UnrollAndJamInnerLoopThreshold = 60;
5191 }
5192}
5193
5194void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
5195 TTI::PeelingPreferences &PP) const {
5196 BaseT::getPeelingPreferences(L, SE, PP);
5197}
5198
5199Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5200 Type *ExpectedType,
5201 bool CanCreate) const {
5202 switch (Inst->getIntrinsicID()) {
5203 default:
5204 return nullptr;
5205 case Intrinsic::aarch64_neon_st2:
5206 case Intrinsic::aarch64_neon_st3:
5207 case Intrinsic::aarch64_neon_st4: {
5208 // Create a struct type
5209 StructType *ST = dyn_cast<StructType>(ExpectedType);
5210 if (!CanCreate || !ST)
5211 return nullptr;
5212 unsigned NumElts = Inst->arg_size() - 1;
5213 if (ST->getNumElements() != NumElts)
5214 return nullptr;
5215 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5216 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5217 return nullptr;
5218 }
5219 Value *Res = PoisonValue::get(ExpectedType);
5220 IRBuilder<> Builder(Inst);
5221 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5222 Value *L = Inst->getArgOperand(i);
5223 Res = Builder.CreateInsertValue(Res, L, i);
5224 }
5225 return Res;
5226 }
5227 case Intrinsic::aarch64_neon_ld2:
5228 case Intrinsic::aarch64_neon_ld3:
5229 case Intrinsic::aarch64_neon_ld4:
5230 if (Inst->getType() == ExpectedType)
5231 return Inst;
5232 return nullptr;
5233 }
5234}
5235
5237 MemIntrinsicInfo &Info) const {
5238 switch (Inst->getIntrinsicID()) {
5239 default:
5240 break;
5241 case Intrinsic::aarch64_neon_ld2:
5242 case Intrinsic::aarch64_neon_ld3:
5243 case Intrinsic::aarch64_neon_ld4:
5244 Info.ReadMem = true;
5245 Info.WriteMem = false;
5246 Info.PtrVal = Inst->getArgOperand(0);
5247 break;
5248 case Intrinsic::aarch64_neon_st2:
5249 case Intrinsic::aarch64_neon_st3:
5250 case Intrinsic::aarch64_neon_st4:
5251 Info.ReadMem = false;
5252 Info.WriteMem = true;
5253 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5254 break;
5255 }
5256
5257 switch (Inst->getIntrinsicID()) {
5258 default:
5259 return false;
5260 case Intrinsic::aarch64_neon_ld2:
5261 case Intrinsic::aarch64_neon_st2:
5262 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5263 break;
5264 case Intrinsic::aarch64_neon_ld3:
5265 case Intrinsic::aarch64_neon_st3:
5266 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5267 break;
5268 case Intrinsic::aarch64_neon_ld4:
5269 case Intrinsic::aarch64_neon_st4:
5270 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5271 break;
5272 }
5273 return true;
5274}
5275
5276/// See if \p I should be considered for address type promotion. We check if \p
5277/// I is a sext with the right type and used in memory accesses. If it is used
5278/// in a "complex" getelementptr, we allow it to be promoted without finding
5279/// other sext instructions that sign extended the same initial value. A
5280/// getelementptr is considered "complex" if it has more than 2 operands.
5282 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5283 bool Considerable = false;
5284 AllowPromotionWithoutCommonHeader = false;
5285 if (!isa<SExtInst>(&I))
5286 return false;
5287 Type *ConsideredSExtType =
5288 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5289 if (I.getType() != ConsideredSExtType)
5290 return false;
5291 // See if the sext is the one with the right type and used in at least one
5292 // GetElementPtrInst.
5293 for (const User *U : I.users()) {
5294 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5295 Considerable = true;
5296 // A getelementptr is considered as "complex" if it has more than 2
5297 // operands. We will promote a SExt used in such complex GEP as we
5298 // expect some computation to be merged if they are done on 64 bits.
5299 if (GEPInst->getNumOperands() > 2) {
5300 AllowPromotionWithoutCommonHeader = true;
5301 break;
5302 }
5303 }
5304 }
5305 return Considerable;
5306}
5307
5309 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5310 if (!VF.isScalable())
5311 return true;
5312
5313 Type *Ty = RdxDesc.getRecurrenceType();
5314 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5315 return false;
5316
5317 switch (RdxDesc.getRecurrenceKind()) {
5318 case RecurKind::Sub:
5320 case RecurKind::Add:
5321 case RecurKind::FAdd:
5322 case RecurKind::And:
5323 case RecurKind::Or:
5324 case RecurKind::Xor:
5325 case RecurKind::SMin:
5326 case RecurKind::SMax:
5327 case RecurKind::UMin:
5328 case RecurKind::UMax:
5329 case RecurKind::FMin:
5330 case RecurKind::FMax:
5331 case RecurKind::FMulAdd:
5332 case RecurKind::AnyOf:
5333 return true;
5334 default:
5335 return false;
5336 }
5337}
5338
5339InstructionCost
5340AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5341 FastMathFlags FMF,
5342 TTI::TargetCostKind CostKind) const {
5343 // The code-generator is currently not able to handle scalable vectors
5344 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5345 // it. This change will be removed when code-generation for these types is
5346 // sufficiently reliable.
5347 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5348 if (VTy->getElementCount() == ElementCount::getScalable(1))
5349 return InstructionCost::getInvalid();
5350
5351 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5352
5353 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5354 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5355
5356 InstructionCost LegalizationCost = 0;
5357 if (LT.first > 1) {
5358 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5359 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5360 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5361 }
5362
5363 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5364}
5365
5366InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5367 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5368 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5369 InstructionCost LegalizationCost = 0;
5370 if (LT.first > 1) {
5371 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5372 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5373 LegalizationCost *= LT.first - 1;
5374 }
5375
5376 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5377 assert(ISD && "Invalid opcode");
5378 // Add the final reduction cost for the legal horizontal reduction
5379 switch (ISD) {
5380 case ISD::ADD:
5381 case ISD::AND:
5382 case ISD::OR:
5383 case ISD::XOR:
5384 case ISD::FADD:
5385 return LegalizationCost + 2;
5386 default:
5387 return InstructionCost::getInvalid();
5388 }
5389}
5390
5391InstructionCost
5392AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5393 std::optional<FastMathFlags> FMF,
5394 TTI::TargetCostKind CostKind) const {
5395 // The code-generator is currently not able to handle scalable vectors
5396 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5397 // it. This change will be removed when code-generation for these types is
5398 // sufficiently reliable.
5399 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5400 if (VTy->getElementCount() == ElementCount::getScalable(1))
5401 return InstructionCost::getInvalid();
5402
5403 if (TTI::requiresOrderedReduction(FMF)) {
5404 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5405 InstructionCost BaseCost =
5406 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5407 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5408 // end up vectorizing for more computationally intensive loops.
5409 return BaseCost + FixedVTy->getNumElements();
5410 }
5411
5412 if (Opcode != Instruction::FAdd)
5413 return InstructionCost::getInvalid();
5414
5415 auto *VTy = cast<ScalableVectorType>(ValTy);
5416 InstructionCost Cost =
5417 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5418 Cost *= getMaxNumElements(VTy->getElementCount());
5419 return Cost;
5420 }
5421
5422 if (isa<ScalableVectorType>(ValTy))
5423 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5424
5425 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5426 MVT MTy = LT.second;
5427 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5428 assert(ISD && "Invalid opcode");
5429
5430 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5431 // instructions as twice a normal vector add, plus 1 for each legalization
5432 // step (LT.first). This is the only arithmetic vector reduction operation for
5433 // which we have an instruction.
5434 // OR, XOR and AND costs should match the codegen from:
5435 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5436 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5437 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
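 // For example (illustrative): a v4i32 add reduction maps to a single addv
 // costed at 2, while a v8i32 one first splits into two v4i32 halves
 // (LT.first == 2), giving (2 - 1) + 2 = 3.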
5438 static const CostTblEntry CostTblNoPairwise[]{
5439 {ISD::ADD, MVT::v8i8, 2},
5440 {ISD::ADD, MVT::v16i8, 2},
5441 {ISD::ADD, MVT::v4i16, 2},
5442 {ISD::ADD, MVT::v8i16, 2},
5443 {ISD::ADD, MVT::v2i32, 2},
5444 {ISD::ADD, MVT::v4i32, 2},
5445 {ISD::ADD, MVT::v2i64, 2},
5446 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5447 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5448 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5449 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5450 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5451 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5452 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5453 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5454 {ISD::XOR, MVT::v16i8, 7},
5455 {ISD::XOR, MVT::v4i16, 4},
5456 {ISD::XOR, MVT::v8i16, 6},
5457 {ISD::XOR, MVT::v2i32, 3},
5458 {ISD::XOR, MVT::v4i32, 5},
5459 {ISD::XOR, MVT::v2i64, 3},
5460 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5461 {ISD::AND, MVT::v16i8, 7},
5462 {ISD::AND, MVT::v4i16, 4},
5463 {ISD::AND, MVT::v8i16, 6},
5464 {ISD::AND, MVT::v2i32, 3},
5465 {ISD::AND, MVT::v4i32, 5},
5466 {ISD::AND, MVT::v2i64, 3},
5467 };
5468 switch (ISD) {
5469 default:
5470 break;
5471 case ISD::FADD:
5472 if (Type *EltTy = ValTy->getScalarType();
5473 // FIXME: For half types without fullfp16 support, this could extend and
5474 // use a fp32 faddp reduction but current codegen unrolls.
5475 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5476 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5477 const unsigned NElts = MTy.getVectorNumElements();
5478 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5479 isPowerOf2_32(NElts))
5480 // Reduction corresponding to series of fadd instructions is lowered to
5481 // series of faddp instructions. faddp has latency/throughput that
5482 // matches fadd instruction and hence, every faddp instruction can be
5483 // considered to have a relative cost = 1 with
5484 // CostKind = TCK_RecipThroughput.
5485 // An faddp will pairwise add vector elements, so the size of input
5486 // vector reduces by half every time, requiring
5487 // #(faddp instructions) = log2_32(NElts).
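 // Illustrative example: a v4f32 fadd reduction needs Log2_32(4) = 2 faddp
 // steps (cost 2), and a v8f32 one adds a legalization step, giving
 // (2 - 1) + 2 = 3.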
5488 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5489 }
5490 break;
5491 case ISD::ADD:
5492 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5493 return (LT.first - 1) + Entry->Cost;
5494 break;
5495 case ISD::XOR:
5496 case ISD::AND:
5497 case ISD::OR:
5498 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5499 if (!Entry)
5500 break;
5501 auto *ValVTy = cast<FixedVectorType>(ValTy);
5502 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5503 isPowerOf2_32(ValVTy->getNumElements())) {
5504 InstructionCost ExtraCost = 0;
5505 if (LT.first != 1) {
5506 // Type needs to be split, so there is an extra cost of LT.first - 1
5507 // arithmetic ops.
5508 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5509 MTy.getVectorNumElements());
5510 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5511 ExtraCost *= LT.first - 1;
5512 }
5513 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5514 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5515 return Cost + ExtraCost;
5516 }
5517 break;
5518 }
5519 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5520}
5521
5523 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5524 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5525 EVT VecVT = TLI->getValueType(DL, VecTy);
5526 EVT ResVT = TLI->getValueType(DL, ResTy);
5527
5528 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5529 VecVT.getSizeInBits() >= 64) {
5530 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5531
5532 // The legal cases are:
5533 // UADDLV 8/16/32->32
5534 // UADDLP 32->64
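 // For example (illustrative): an extending add reduction from v16i8 to i32
 // fits the UADDLV pattern, so with LT.first == 1 the cost below is
 // (1 - 1) * 2 + 2 = 2.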
5535 unsigned RevVTSize = ResVT.getSizeInBits();
5536 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5537 RevVTSize <= 32) ||
5538 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5539 RevVTSize <= 32) ||
5540 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5541 RevVTSize <= 64))
5542 return (LT.first - 1) * 2 + 2;
5543 }
5544
5545 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5546 CostKind);
5547}
5548
5549InstructionCost
5550AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5551 Type *ResTy, VectorType *VecTy,
5552 TTI::TargetCostKind CostKind) const {
5553 EVT VecVT = TLI->getValueType(DL, VecTy);
5554 EVT ResVT = TLI->getValueType(DL, ResTy);
5555
5556 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5557 RedOpcode == Instruction::Add) {
5558 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5559
5560 // The legal cases with dotprod are
5561 // UDOT 8->32
5562 // Which requires an additional uaddv to sum the i32 values.
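 // For example (illustrative): a v16i8 multiply-accumulate reduction into i32
 // maps to udot/sdot plus the final reduction, so with LT.first == 1 the cost
 // below is 1 + 2 = 3.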
5563 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5564 ResVT == MVT::i32)
5565 return LT.first + 2;
5566 }
5567
5568 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5569 CostKind);
5570}
5571
5572InstructionCost
5573AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5574 TTI::TargetCostKind CostKind) const {
5575 static const CostTblEntry ShuffleTbl[] = {
5576 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5577 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5578 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5579 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5580 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5581 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5582 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5583 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5584 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5585 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5586 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5587 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5588 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5589 };
5590
5591 // The code-generator is currently not able to handle scalable vectors
5592 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5593 // it. This change will be removed when code-generation for these types is
5594 // sufficiently reliable.
5595 if (Tp->getElementCount() == ElementCount::getScalable(1))
5596 return InstructionCost::getInvalid();
5597
5598 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5599 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5600 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5601 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5602 : LT.second;
5603 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5604 InstructionCost LegalizationCost = 0;
5605 if (Index < 0) {
5606 LegalizationCost =
5607 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5608 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5609 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5610 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5611 }
5612
5613 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
5614 // Cost performed on a promoted type.
5615 if (LT.second.getScalarType() == MVT::i1) {
5616 LegalizationCost +=
5617 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5618 TTI::CastContextHint::None, CostKind) +
5619 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5620 TTI::CastContextHint::None, CostKind);
5621 }
5622 const auto *Entry =
5623 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5624 assert(Entry && "Illegal Type for Splice");
5625 LegalizationCost += Entry->Cost;
5626 return LegalizationCost * LT.first;
5627}
5628
5629InstructionCost AArch64TTIImpl::getPartialReductionCost(
5630 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5631 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5632 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5633 TTI::TargetCostKind CostKind) const {
5634 InstructionCost Invalid = InstructionCost::getInvalid();
5635 InstructionCost Cost(TTI::TCC_Basic);
5636
5637 if (CostKind != TTI::TCK_RecipThroughput)
5638 return Invalid;
5639
5640 // Sub opcodes currently only occur in chained cases.
5641 // Independent partial reduction subtractions are still costed as an add
5642 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5643 OpAExtend == TTI::PR_None)
5644 return Invalid;
5645
5646 // We only support multiply binary operations for now, and for muls we
5647 // require the types being extended to be the same.
5648 // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but
5649 // only if the i8mm or sve/streaming features are available.
5650 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB ||
5651 OpBExtend == TTI::PR_None ||
5652 (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
5653 !ST->isSVEorStreamingSVEAvailable())))
5654 return Invalid;
5655 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5656 "Unexpected values for OpBExtend or InputTypeB");
5657
5658 EVT InputEVT = EVT::getEVT(InputTypeA);
5659 EVT AccumEVT = EVT::getEVT(AccumType);
5660
5661 unsigned VFMinValue = VF.getKnownMinValue();
5662
5663 if (VF.isScalable()) {
5664 if (!ST->isSVEorStreamingSVEAvailable())
5665 return Invalid;
5666
5667 // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
5668 // since we can't lower that type.
5669 unsigned Scale =
5670 AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
5671 if (VFMinValue == Scale)
5672 return Invalid;
5673 }
5674 if (VF.isFixed() &&
5675 (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64))
5676 return Invalid;
5677
5678 if (InputEVT == MVT::i8) {
5679 switch (VFMinValue) {
5680 default:
5681 return Invalid;
5682 case 8:
5683 if (AccumEVT == MVT::i32)
5684 Cost *= 2;
5685 else if (AccumEVT != MVT::i64)
5686 return Invalid;
5687 break;
5688 case 16:
5689 if (AccumEVT == MVT::i64)
5690 Cost *= 2;
5691 else if (AccumEVT != MVT::i32)
5692 return Invalid;
5693 break;
5694 }
5695 } else if (InputEVT == MVT::i16) {
5696 // FIXME: Allow i32 accumulator but increase cost, as we would extend
5697 // it to i64.
5698 if (VFMinValue != 8 || AccumEVT != MVT::i64)
5699 return Invalid;
5700 } else
5701 return Invalid;
5702
5703 return Cost;
5704}
5705
5706InstructionCost
5707AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
5708 VectorType *SrcTy, ArrayRef<int> Mask,
5709 TTI::TargetCostKind CostKind, int Index,
5710 VectorType *SubTp, ArrayRef<const Value *> Args,
5711 const Instruction *CxtI) const {
5712 assert((Mask.empty() || DstTy->isScalableTy() ||
5713 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5714 "Expected the Mask to match the return size if given");
5715 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5716 "Expected the same scalar types");
5717 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5718
5719 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5720 // into smaller vectors and sum the cost of each shuffle.
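 // For example (illustrative): a 16-element mask on a type that legalizes to
 // v4i32 is split into four 4-element sub-masks, each costed independently
 // (with duplicated sub-shuffles only costed once via the map below).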
5721 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5722 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5723 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5724 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5725 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5726 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5727 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5728 // cost than just the load.
5729 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5730 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4) ||
5731 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3)))
5732 return std::max<InstructionCost>(1, LT.first / 4);
5733
5734 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5735 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5736 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5737 // cost than just the store.
5738 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5739 (ShuffleVectorInst::isInterleaveMask(
5740 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
5741 ShuffleVectorInst::isInterleaveMask(
5742 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
5743 return LT.first;
5744
5745 unsigned TpNumElts = Mask.size();
5746 unsigned LTNumElts = LT.second.getVectorNumElements();
5747 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5748 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
5749 LT.second.getVectorElementCount());
5750 InstructionCost Cost;
5751 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5752 PreviousCosts;
5753 for (unsigned N = 0; N < NumVecs; N++) {
5754 SmallVector<int> NMask;
5755 // Split the existing mask into chunks of size LTNumElts. Track the source
5756 // sub-vectors to ensure the result has at most 2 inputs.
5757 unsigned Source1 = -1U, Source2 = -1U;
5758 unsigned NumSources = 0;
5759 for (unsigned E = 0; E < LTNumElts; E++) {
5760 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5761 : PoisonMaskElem;
5762 if (MaskElt < 0) {
5763 NMask.push_back(PoisonMaskElem);
5764 continue;
5765 }
5766
5767 // Calculate which source from the input this comes from and whether it
5768 // is new to us.
5769 unsigned Source = MaskElt / LTNumElts;
5770 if (NumSources == 0) {
5771 Source1 = Source;
5772 NumSources = 1;
5773 } else if (NumSources == 1 && Source != Source1) {
5774 Source2 = Source;
5775 NumSources = 2;
5776 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5777 NumSources++;
5778 }
5779
5780 // Add to the new mask. For the NumSources>2 case these are not correct,
5781 // but are only used for the modular lane number.
5782 if (Source == Source1)
5783 NMask.push_back(MaskElt % LTNumElts);
5784 else if (Source == Source2)
5785 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
5786 else
5787 NMask.push_back(MaskElt % LTNumElts);
5788 }
5789 // Check if we have already generated this sub-shuffle, which means we
5790 // will have already generated the output. For example a <16 x i32> splat
5791 // will be the same sub-splat 4 times, which only needs to be generated
5792 // once and reused.
5793 auto Result =
5794 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
5795 // Check if it was already in the map (already costed).
5796 if (!Result.second)
5797 continue;
5798 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
5799 // getShuffleCost. If not then cost it using the worst case as the number
5800 // of element moves into a new vector.
5801 InstructionCost NCost =
5802 NumSources <= 2
5803 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
5804 : TTI::SK_PermuteTwoSrc,
5805 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
5806 CxtI)
5807 : LTNumElts;
5808 Result.first->second = NCost;
5809 Cost += NCost;
5810 }
5811 return Cost;
5812 }
5813
5814 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
5815 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
5816 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
5817 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
5818 // This currently only handles low or high extracts to prevent SLP vectorizer
5819 // regressions.
5820 // Note that SVE's ext instruction is destructive, but it can be fused with
5821 // a movprfx to act like a constructive instruction.
5822 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
5823 if (LT.second.getFixedSizeInBits() >= 128 &&
5824 cast<FixedVectorType>(SubTp)->getNumElements() ==
5825 LT.second.getVectorNumElements() / 2) {
5826 if (Index == 0)
5827 return 0;
5828 if (Index == (int)LT.second.getVectorNumElements() / 2)
5829 return 1;
5830 }
5831 Kind = TTI::SK_PermuteSingleSrc;
5832 }
5833 // FIXME: This was added to keep the costs equal when adding DstTys. Update
5834 // the code to handle length-changing shuffles.
5835 if (Kind == TTI::SK_InsertSubvector) {
5836 LT = getTypeLegalizationCost(DstTy);
5837 SrcTy = DstTy;
5838 }
5839
5840 // Segmented shuffle matching.
5841 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
5842 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
5843 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
5844 AArch64::SVEBitsPerBlock)) {
5845
5846 FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
5847 unsigned Segments =
5848 VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
5849 unsigned SegmentElts = VTy->getNumElements() / Segments;
5850
5851 // dupq zd.t, zn.t[idx]
5852 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
5853 ST->isSVEorStreamingSVEAvailable() &&
5854 isDUPQMask(Mask, Segments, SegmentElts))
5855 return LT.first;
5856
5857 // mov zd.q, vn
5858 if (ST->isSVEorStreamingSVEAvailable() &&
5859 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
5860 return LT.first;
5861 }
5862
5863 // Check for broadcast loads, which are supported by the LD1R instruction.
5864 // In terms of code-size, the shuffle vector is free when a load + dup get
5865 // folded into a LD1R. That's what we check and return here. For performance
5866 // and reciprocal throughput, a LD1R is not completely free. In this case, we
5867 // return the cost for the broadcast below (i.e. 1 for most/all types), so
5868 // that we model the load + dup sequence slightly higher because LD1R is a
5869 // high latency instruction.
5870 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
5871 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
5872 if (IsLoad && LT.second.isVector() &&
5873 isLegalBroadcastLoad(SrcTy->getElementType(),
5874 LT.second.getVectorElementCount()))
5875 return 0;
5876 }
5877
5878 // If we have 4 elements for the shuffle and a Mask, get the cost straight
5879 // from the perfect shuffle tables.
5880 if (Mask.size() == 4 &&
5881 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
5882 (SrcTy->getScalarSizeInBits() == 16 ||
5883 SrcTy->getScalarSizeInBits() == 32) &&
5884 all_of(Mask, [](int E) { return E < 8; }))
5885 return getPerfectShuffleCost(Mask);
5886
5887 // Check for identity masks, which we can treat as free.
5888 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
5889 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5890 all_of(enumerate(Mask), [](const auto &M) {
5891 return M.value() < 0 || M.value() == (int)M.index();
5892 }))
5893 return 0;
5894
5895 // Check for other shuffles that are not SK_ kinds but we have native
5896 // instructions for, for example ZIP and UZP.
5897 unsigned Unused;
5898 if (LT.second.isFixedLengthVector() &&
5899 LT.second.getVectorNumElements() == Mask.size() &&
5900 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5901 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
5902 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
5903 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5904 LT.second.getVectorNumElements(), 16) ||
5905 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5906 LT.second.getVectorNumElements(), 32) ||
5907 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5908 LT.second.getVectorNumElements(), 64) ||
5909 // Check for non-zero lane splats
5910 all_of(drop_begin(Mask),
5911 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
5912 return 1;
5913
5914 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
5915 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
5916 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
5917 static const CostTblEntry ShuffleTbl[] = {
5918 // Broadcast shuffle kinds can be performed with 'dup'.
5919 {TTI::SK_Broadcast, MVT::v8i8, 1},
5920 {TTI::SK_Broadcast, MVT::v16i8, 1},
5921 {TTI::SK_Broadcast, MVT::v4i16, 1},
5922 {TTI::SK_Broadcast, MVT::v8i16, 1},
5923 {TTI::SK_Broadcast, MVT::v2i32, 1},
5924 {TTI::SK_Broadcast, MVT::v4i32, 1},
5925 {TTI::SK_Broadcast, MVT::v2i64, 1},
5926 {TTI::SK_Broadcast, MVT::v4f16, 1},
5927 {TTI::SK_Broadcast, MVT::v8f16, 1},
5928 {TTI::SK_Broadcast, MVT::v4bf16, 1},
5929 {TTI::SK_Broadcast, MVT::v8bf16, 1},
5930 {TTI::SK_Broadcast, MVT::v2f32, 1},
5931 {TTI::SK_Broadcast, MVT::v4f32, 1},
5932 {TTI::SK_Broadcast, MVT::v2f64, 1},
5933 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
5934 // 'zip1/zip2' instructions.
5935 {TTI::SK_Transpose, MVT::v8i8, 1},
5936 {TTI::SK_Transpose, MVT::v16i8, 1},
5937 {TTI::SK_Transpose, MVT::v4i16, 1},
5938 {TTI::SK_Transpose, MVT::v8i16, 1},
5939 {TTI::SK_Transpose, MVT::v2i32, 1},
5940 {TTI::SK_Transpose, MVT::v4i32, 1},
5941 {TTI::SK_Transpose, MVT::v2i64, 1},
5942 {TTI::SK_Transpose, MVT::v4f16, 1},
5943 {TTI::SK_Transpose, MVT::v8f16, 1},
5944 {TTI::SK_Transpose, MVT::v4bf16, 1},
5945 {TTI::SK_Transpose, MVT::v8bf16, 1},
5946 {TTI::SK_Transpose, MVT::v2f32, 1},
5947 {TTI::SK_Transpose, MVT::v4f32, 1},
5948 {TTI::SK_Transpose, MVT::v2f64, 1},
5949 // Select shuffle kinds.
5950 // TODO: handle vXi8/vXi16.
5951 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
5952 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
5953 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
5954 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
5955 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
5956 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
5957 // PermuteSingleSrc shuffle kinds.
5958 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
5959 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
5960 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
5961 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
5962 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
5963 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
5964 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
5965 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
5966 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
5967 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
5968 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
5969 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
5970 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
5971 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
5972 // Reverse can be lowered with `rev`.
5973 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
5974 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
5975 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
5976 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
5977 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
5978 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
5979 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
5980 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
5981 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
5982 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
5983 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
5984 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
5985 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
5986 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
5987 // Splice can all be lowered as `ext`.
5988 {TTI::SK_Splice, MVT::v2i32, 1},
5989 {TTI::SK_Splice, MVT::v4i32, 1},
5990 {TTI::SK_Splice, MVT::v2i64, 1},
5991 {TTI::SK_Splice, MVT::v2f32, 1},
5992 {TTI::SK_Splice, MVT::v4f32, 1},
5993 {TTI::SK_Splice, MVT::v2f64, 1},
5994 {TTI::SK_Splice, MVT::v8f16, 1},
5995 {TTI::SK_Splice, MVT::v8bf16, 1},
5996 {TTI::SK_Splice, MVT::v8i16, 1},
5997 {TTI::SK_Splice, MVT::v16i8, 1},
5998 {TTI::SK_Splice, MVT::v4f16, 1},
5999 {TTI::SK_Splice, MVT::v4bf16, 1},
6000 {TTI::SK_Splice, MVT::v4i16, 1},
6001 {TTI::SK_Splice, MVT::v8i8, 1},
6002 // Broadcast shuffle kinds for scalable vectors
6003 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6004 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6005 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6006 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6007 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6008 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6009 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6010 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6011 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6012 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6013 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6014 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6015 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6016 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6017 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6018 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6019 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6020 // Handle the cases for vector.reverse with scalable vectors
6021 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6022 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6023 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6024 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6025 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6026 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6027 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6028 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6029 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6030 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6031 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6032 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6033 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6034 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6035 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6036 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6037 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6038 };
6039 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6040 return LT.first * Entry->Cost;
6041 }
6042
6043 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6044 return getSpliceCost(SrcTy, Index, CostKind);
6045
6046 // Inserting a subvector can often be done with either a D, S or H register
6047 // move, so long as the inserted vector is "aligned".
6048 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6049 LT.second.getSizeInBits() <= 128 && SubTp) {
6050 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6051 if (SubLT.second.isVector()) {
6052 int NumElts = LT.second.getVectorNumElements();
6053 int NumSubElts = SubLT.second.getVectorNumElements();
6054 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6055 return SubLT.first;
6056 }
6057 }
6058
6059 // Restore optimal kind.
6060 if (IsExtractSubvector)
6061 Kind = TTI::SK_ExtractSubvector;
6062 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6063 Args, CxtI);
6064}
6065static bool containsDecreasingPointers(Loop *TheLoop,
6066 PredicatedScalarEvolution *PSE) {
6067 const auto &Strides = DenseMap<Value *, const SCEV *>();
6069 for (BasicBlock *BB : TheLoop->blocks()) {
6070 // Scan the instructions in the block and look for addresses that are
6071 // consecutive and decreasing.
6072 for (Instruction &I : *BB) {
6073 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6074 Value *Ptr = getLoadStorePointerOperand(&I);
6075 Type *AccessTy = getLoadStoreType(&I);
6076 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
6077 /*ShouldCheckWrap=*/false)
6078 .value_or(0) < 0)
6079 return true;
6080 }
6081 }
6082 }
6083 return false;
6084}
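// For example, a loop such as "for (i = N; i > 0; --i) Sum += A[i - 1];"
// accesses A with a negative stride; getPtrStride() returns a value < 0 for
// it, so the caller knows the loop would need a reversed predicate if it were
// tail-folded.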
6085
6086 bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6087 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6088 return SVEPreferFixedOverScalableIfEqualCost;
6089 // For cases like post-LTO vectorization, where the trip count only becomes
6090 // known eventually, a fixed-width vectorized epilogue can be deleted if the
6091 // trip count turns out to be less than the epilogue's iteration count. That
6092 // is why we prefer fixed-width vectorization for the epilogue on equal costs.
6093 if (IsEpilogue)
6094 return true;
6095 return ST->useFixedOverScalableIfEqualCost();
6096}
6097
6098 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6099 return ST->getEpilogueVectorizationMinVF();
6100}
6101
6102 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
6103 if (!ST->hasSVE())
6104 return false;
6105
6106 // We don't currently support vectorisation with interleaving for SVE - with
6107 // such loops we're better off not using tail-folding. This gives us a chance
6108 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6109 if (TFI->IAI->hasGroups())
6110 return false;
6111
6112 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6113 if (TFI->LVL->getReductionVars().size())
6114 Required |= TailFoldingOpts::Reductions;
6115 if (TFI->LVL->getFixedOrderRecurrences().size())
6116 Required |= TailFoldingOpts::Recurrences;
6117
6118 // We call this to discover whether any load/store pointers in the loop have
6119 // negative strides. This will require extra work to reverse the loop
6120 // predicate, which may be expensive.
6121 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6122 TFI->LVL->getPredicatedScalarEvolution()))
6123 Required |= TailFoldingOpts::Reverse;
6124 if (Required == TailFoldingOpts::Disabled)
6125 Required |= TailFoldingOpts::Simple;
6126
6127 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6128 Required))
6129 return false;
6130
6131 // Don't tail-fold for tight loops where we would be better off interleaving
6132 // with an unpredicated loop.
6133 unsigned NumInsns = 0;
6134 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6135 NumInsns += BB->sizeWithoutDebug();
6136 }
6137
6138 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6139 return NumInsns >= SVETailFoldInsnThreshold;
6140}
6141
6142 InstructionCost
6143 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6144 StackOffset BaseOffset, bool HasBaseReg,
6145 int64_t Scale, unsigned AddrSpace) const {
6146 // Scaling factors are not free at all.
6147 // Operands | Rt Latency
6148 // -------------------------------------------
6149 // Rt, [Xn, Xm] | 4
6150 // -------------------------------------------
6151 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6152 // Rt, [Xn, Wm, <extend> #imm] |
6153 TargetLoweringBase::AddrMode AM;
6154 AM.BaseGV = BaseGV;
6155 AM.BaseOffs = BaseOffset.getFixed();
6156 AM.HasBaseReg = HasBaseReg;
6157 AM.Scale = Scale;
6158 AM.ScalableOffset = BaseOffset.getScalable();
6159 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6160 // Scale represents reg2 * scale, thus account for 1 if
6161 // it is not equal to 0 or 1.
6162 return AM.Scale != 0 && AM.Scale != 1;
6163 return InstructionCost::getInvalid();
6164}
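// For example, "ldr x0, [x1, x2, lsl #3]" is a legal addressing mode with
// Scale == 8, so the cost above is 1 (the scaled register is not free),
// whereas the unscaled form "ldr x0, [x1, x2]" (Scale == 1) costs 0.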
6165
6166 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6167 const Instruction *I) const {
6168 if (EnableOrLikeSelectOpt) {
6169 // For the binary operators (e.g. or) we need to be more careful than for
6170 // selects; here we only transform them if they are already at a natural
6171 // break point in the code - the end of a block with an unconditional
6172 // terminator.
6173 if (I->getOpcode() == Instruction::Or &&
6174 isa<BranchInst>(I->getNextNode()) &&
6175 cast<BranchInst>(I->getNextNode())->isUnconditional())
6176 return true;
6177
6178 if (I->getOpcode() == Instruction::Add ||
6179 I->getOpcode() == Instruction::Sub)
6180 return true;
6181 }
6182 return BaseT::shouldTreatInstructionLikeSelect(I);
6183}
6184
6185 bool AArch64TTIImpl::isLSRCostLess(
6186 const TargetTransformInfo::LSRCost &C1,
6187 const TargetTransformInfo::LSRCost &C2) const {
6188 // AArch64 specific here is adding the number of instructions to the
6189 // comparison (though not as the first consideration, as some targets do)
6190 // along with changing the priority of the base additions.
6191 // TODO: Maybe a more nuanced tradeoff between instruction count
6192 // and number of registers? To be investigated at a later date.
6193 if (EnableLSRCostOpt)
6194 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6195 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6196 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6197 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6198
6199 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6200}
6201
6202static bool isSplatShuffle(Value *V) {
6203 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6204 return all_equal(Shuf->getShuffleMask());
6205 return false;
6206}
6207
6208/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6209/// or upper half of the vector elements.
6210static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6211 bool AllowSplat = false) {
6212 // Scalable types can't be extract shuffle vectors.
6213 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6214 return false;
6215
6216 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6217 auto *FullTy = FullV->getType();
6218 auto *HalfTy = HalfV->getType();
6219 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6220 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6221 };
6222
6223 auto extractHalf = [](Value *FullV, Value *HalfV) {
6224 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6225 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6226 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6227 };
6228
6229 ArrayRef<int> M1, M2;
6230 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6231 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6232 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6233 return false;
6234
6235 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6236 // it is not checked as an extract below.
6237 if (AllowSplat && isSplatShuffle(Op1))
6238 S1Op1 = nullptr;
6239 if (AllowSplat && isSplatShuffle(Op2))
6240 S2Op1 = nullptr;
6241
6242 // Check that the operands are half as wide as the result and we extract
6243 // half of the elements of the input vectors.
6244 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6245 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6246 return false;
6247
6248 // Check the mask extracts either the lower or upper half of vector
6249 // elements.
6250 int M1Start = 0;
6251 int M2Start = 0;
6252 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6253 if ((S1Op1 &&
6254 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6255 (S2Op1 &&
6256 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6257 return false;
6258
6259 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6260 (M2Start != 0 && M2Start != (NumElements / 2)))
6261 return false;
6262 if (S1Op1 && S2Op1 && M1Start != M2Start)
6263 return false;
6264
6265 return true;
6266}
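// For example, both of these are accepted as matching upper-half extracts of
// <8 x i16> sources (the masks must select the same half unless one operand is
// a splat and AllowSplat is set):
//   %a = shufflevector <8 x i16> %x, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
//   %b = shufflevector <8 x i16> %y, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>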
6267
6268/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6269/// of the vector elements.
6270static bool areExtractExts(Value *Ext1, Value *Ext2) {
6271 auto areExtDoubled = [](Instruction *Ext) {
6272 return Ext->getType()->getScalarSizeInBits() ==
6273 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6274 };
6275
6276 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6277 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6278 !areExtDoubled(cast<Instruction>(Ext1)) ||
6279 !areExtDoubled(cast<Instruction>(Ext2)))
6280 return false;
6281
6282 return true;
6283}
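// For example, "%e1 = sext <4 x i16> %a to <4 x i32>" and
// "%e2 = sext <4 x i16> %b to <4 x i32>" both double the element width, so an
// add/sub fed by them can typically be selected as saddl/ssubl (or the "2"
// variants when combined with upper-half extract shuffles).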
6284
6285/// Check if Op could be used with vmull_high_p64 intrinsic.
6286 static bool isOperandOfVmullHighP64(Value *Op) {
6287 Value *VectorOperand = nullptr;
6288 ConstantInt *ElementIndex = nullptr;
6289 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6290 m_ConstantInt(ElementIndex))) &&
6291 ElementIndex->getValue() == 1 &&
6292 isa<FixedVectorType>(VectorOperand->getType()) &&
6293 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6294}
6295
6296/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6297static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6298 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6299}
6300
6301 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6302 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6303 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6304 if (!GEP || GEP->getNumOperands() != 2)
6305 return false;
6306
6307 Value *Base = GEP->getOperand(0);
6308 Value *Offsets = GEP->getOperand(1);
6309
6310 // We only care about scalar_base+vector_offsets.
6311 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6312 return false;
6313
6314 // Sink extends that would allow us to use 32-bit offset vectors.
6315 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6316 auto *OffsetsInst = cast<Instruction>(Offsets);
6317 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6318 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6319 Ops.push_back(&GEP->getOperandUse(1));
6320 }
6321
6322 // Sink the GEP.
6323 return true;
6324}
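// For example, for a masked gather whose pointers come from
//   %ext  = zext <vscale x 4 x i32> %off to <vscale x 4 x i64>
//   %ptrs = getelementptr i32, ptr %base, <vscale x 4 x i64> %ext
// both the zext and the GEP are sunk next to the gather, which helps isel pick
// the scalar-base + 32-bit vector offset gather forms (sxtw/uxtw extends).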
6325
6326 /// We want to sink the following cases:
6327/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6328/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6329 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6330 if (match(Op, m_VScale()))
6331 return true;
6332 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6333 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6334 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6335 return true;
6336 }
6337 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6338 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6339 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6340 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6341 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6342 return true;
6343 }
6344 return false;
6345}
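// For example, given
//   %vs  = call i64 @llvm.vscale.i64()
//   %mul = shl i64 %vs, 4
//   %gep = getelementptr i8, ptr %base, i64 %mul
// the vscale call (and the shift) is sunk next to the gep, so that isel can
// fold the whole offset into an SVE "[base, #imm, mul vl]" addressing form.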
6346
6347/// Check if sinking \p I's operands to I's basic block is profitable, because
6348/// the operands can be folded into a target instruction, e.g.
6349/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
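/// For example, for a call to llvm.aarch64.neon.smull whose two operands are
/// shufflevectors extracting the upper halves of <8 x i16> values, sinking
/// those shuffles next to the call lets instruction selection emit a single
/// smull2 rather than separate ext and smull instructions.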
6350 bool AArch64TTIImpl::isProfitableToSinkOperands(
6351 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6352 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
6353 switch (II->getIntrinsicID()) {
6354 case Intrinsic::aarch64_neon_smull:
6355 case Intrinsic::aarch64_neon_umull:
6356 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6357 /*AllowSplat=*/true)) {
6358 Ops.push_back(&II->getOperandUse(0));
6359 Ops.push_back(&II->getOperandUse(1));
6360 return true;
6361 }
6362 [[fallthrough]];
6363
6364 case Intrinsic::fma:
6365 case Intrinsic::fmuladd:
6366 if (isa<VectorType>(I->getType()) &&
6367 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6368 !ST->hasFullFP16())
6369 return false;
6370 [[fallthrough]];
6371 case Intrinsic::aarch64_neon_sqdmull:
6372 case Intrinsic::aarch64_neon_sqdmulh:
6373 case Intrinsic::aarch64_neon_sqrdmulh:
6374 // Sink splats for index lane variants
6375 if (isSplatShuffle(II->getOperand(0)))
6376 Ops.push_back(&II->getOperandUse(0));
6377 if (isSplatShuffle(II->getOperand(1)))
6378 Ops.push_back(&II->getOperandUse(1));
6379 return !Ops.empty();
6380 case Intrinsic::aarch64_neon_fmlal:
6381 case Intrinsic::aarch64_neon_fmlal2:
6382 case Intrinsic::aarch64_neon_fmlsl:
6383 case Intrinsic::aarch64_neon_fmlsl2:
6384 // Sink splats for index lane variants
6385 if (isSplatShuffle(II->getOperand(1)))
6386 Ops.push_back(&II->getOperandUse(1));
6387 if (isSplatShuffle(II->getOperand(2)))
6388 Ops.push_back(&II->getOperandUse(2));
6389 return !Ops.empty();
6390 case Intrinsic::aarch64_sve_ptest_first:
6391 case Intrinsic::aarch64_sve_ptest_last:
6392 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6393 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6394 Ops.push_back(&II->getOperandUse(0));
6395 return !Ops.empty();
6396 case Intrinsic::aarch64_sme_write_horiz:
6397 case Intrinsic::aarch64_sme_write_vert:
6398 case Intrinsic::aarch64_sme_writeq_horiz:
6399 case Intrinsic::aarch64_sme_writeq_vert: {
6400 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6401 if (!Idx || Idx->getOpcode() != Instruction::Add)
6402 return false;
6403 Ops.push_back(&II->getOperandUse(1));
6404 return true;
6405 }
6406 case Intrinsic::aarch64_sme_read_horiz:
6407 case Intrinsic::aarch64_sme_read_vert:
6408 case Intrinsic::aarch64_sme_readq_horiz:
6409 case Intrinsic::aarch64_sme_readq_vert:
6410 case Intrinsic::aarch64_sme_ld1b_vert:
6411 case Intrinsic::aarch64_sme_ld1h_vert:
6412 case Intrinsic::aarch64_sme_ld1w_vert:
6413 case Intrinsic::aarch64_sme_ld1d_vert:
6414 case Intrinsic::aarch64_sme_ld1q_vert:
6415 case Intrinsic::aarch64_sme_st1b_vert:
6416 case Intrinsic::aarch64_sme_st1h_vert:
6417 case Intrinsic::aarch64_sme_st1w_vert:
6418 case Intrinsic::aarch64_sme_st1d_vert:
6419 case Intrinsic::aarch64_sme_st1q_vert:
6420 case Intrinsic::aarch64_sme_ld1b_horiz:
6421 case Intrinsic::aarch64_sme_ld1h_horiz:
6422 case Intrinsic::aarch64_sme_ld1w_horiz:
6423 case Intrinsic::aarch64_sme_ld1d_horiz:
6424 case Intrinsic::aarch64_sme_ld1q_horiz:
6425 case Intrinsic::aarch64_sme_st1b_horiz:
6426 case Intrinsic::aarch64_sme_st1h_horiz:
6427 case Intrinsic::aarch64_sme_st1w_horiz:
6428 case Intrinsic::aarch64_sme_st1d_horiz:
6429 case Intrinsic::aarch64_sme_st1q_horiz: {
6430 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6431 if (!Idx || Idx->getOpcode() != Instruction::Add)
6432 return false;
6433 Ops.push_back(&II->getOperandUse(3));
6434 return true;
6435 }
6436 case Intrinsic::aarch64_neon_pmull:
6437 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6438 return false;
6439 Ops.push_back(&II->getOperandUse(0));
6440 Ops.push_back(&II->getOperandUse(1));
6441 return true;
6442 case Intrinsic::aarch64_neon_pmull64:
6443 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6444 II->getArgOperand(1)))
6445 return false;
6446 Ops.push_back(&II->getArgOperandUse(0));
6447 Ops.push_back(&II->getArgOperandUse(1));
6448 return true;
6449 case Intrinsic::masked_gather:
6450 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6451 return false;
6452 Ops.push_back(&II->getArgOperandUse(0));
6453 return true;
6454 case Intrinsic::masked_scatter:
6455 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6456 return false;
6457 Ops.push_back(&II->getArgOperandUse(1));
6458 return true;
6459 default:
6460 return false;
6461 }
6462 }
6463
6464 auto ShouldSinkCondition = [](Value *Cond,
6465 SmallVectorImpl<Use *> &Ops) -> bool {
6466 if (!isa<IntrinsicInst>(Cond))
6467 return false;
6468 auto *II = cast<IntrinsicInst>(Cond);
6469 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6470 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6471 return false;
6472 if (isa<CmpInst>(II->getOperand(0)))
6473 Ops.push_back(&II->getOperandUse(0));
6474 return true;
6475 };
6476
6477 switch (I->getOpcode()) {
6478 case Instruction::GetElementPtr:
6479 case Instruction::Add:
6480 case Instruction::Sub:
6481 // Sink vscales closer to uses for better isel
6482 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6483 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6484 Ops.push_back(&I->getOperandUse(Op));
6485 return true;
6486 }
6487 }
6488 break;
6489 case Instruction::Select: {
6490 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6491 return false;
6492
6493 Ops.push_back(&I->getOperandUse(0));
6494 return true;
6495 }
6496 case Instruction::Br: {
6497 if (cast<BranchInst>(I)->isUnconditional())
6498 return false;
6499
6500 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6501 return false;
6502
6503 Ops.push_back(&I->getOperandUse(0));
6504 return true;
6505 }
6506 default:
6507 break;
6508 }
6509
6510 if (!I->getType()->isVectorTy())
6511 return false;
6512
6513 switch (I->getOpcode()) {
6514 case Instruction::Sub:
6515 case Instruction::Add: {
6516 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6517 return false;
6518
6519 // If the exts' operands extract either the lower or upper elements, we
6520 // can sink them too.
6521 auto Ext1 = cast<Instruction>(I->getOperand(0));
6522 auto Ext2 = cast<Instruction>(I->getOperand(1));
6523 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6524 Ops.push_back(&Ext1->getOperandUse(0));
6525 Ops.push_back(&Ext2->getOperandUse(0));
6526 }
6527
6528 Ops.push_back(&I->getOperandUse(0));
6529 Ops.push_back(&I->getOperandUse(1));
6530
6531 return true;
6532 }
6533 case Instruction::Or: {
6534 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6535 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6536 if (ST->hasNEON()) {
6537 Instruction *OtherAnd, *IA, *IB;
6538 Value *MaskValue;
6539 // MainAnd refers to the And instruction that has 'Not' as one of its operands
6540 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6541 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6542 m_Instruction(IA)))))) {
6543 if (match(OtherAnd,
6544 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6545 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6546 ? cast<Instruction>(I->getOperand(1))
6547 : cast<Instruction>(I->getOperand(0));
6548
6549 // Both Ands should be in same basic block as Or
6550 if (I->getParent() != MainAnd->getParent() ||
6551 I->getParent() != OtherAnd->getParent())
6552 return false;
6553
6554 // Non-mask operands of both Ands should also be in same basic block
6555 if (I->getParent() != IA->getParent() ||
6556 I->getParent() != IB->getParent())
6557 return false;
6558
6559 Ops.push_back(
6560 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6561 Ops.push_back(&I->getOperandUse(0));
6562 Ops.push_back(&I->getOperandUse(1));
6563
6564 return true;
6565 }
6566 }
6567 }
6568
6569 return false;
6570 }
6571 case Instruction::Mul: {
6572 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6573 auto *Ty = cast<VectorType>(V->getType());
6574 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6575 if (Ty->isScalableTy())
6576 return false;
6577
6578 // Indexed variants of Mul exist for i16 and i32 element types only.
6579 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6580 };
6581
6582 int NumZExts = 0, NumSExts = 0;
6583 for (auto &Op : I->operands()) {
6584 // Make sure we are not already sinking this operand
6585 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6586 continue;
6587
6588 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6589 auto *Ext = cast<Instruction>(Op);
6590 auto *ExtOp = Ext->getOperand(0);
6591 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6592 Ops.push_back(&Ext->getOperandUse(0));
6593 Ops.push_back(&Op);
6594
6595 if (isa<SExtInst>(Ext))
6596 NumSExts++;
6597 else
6598 NumZExts++;
6599
6600 continue;
6601 }
6602
6603 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6604 if (!Shuffle)
6605 continue;
6606
6607 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6608 // operand and the s/zext can help create indexed s/umull. This is
6609 // especially useful to prevent an i64 mul from being scalarized.
6610 if (isSplatShuffle(Shuffle) &&
6611 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6612 Ops.push_back(&Shuffle->getOperandUse(0));
6613 Ops.push_back(&Op);
6614 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6615 NumSExts++;
6616 else
6617 NumZExts++;
6618 continue;
6619 }
6620
6621 Value *ShuffleOperand = Shuffle->getOperand(0);
6622 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6623 if (!Insert)
6624 continue;
6625
6626 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6627 if (!OperandInstr)
6628 continue;
6629
6630 ConstantInt *ElementConstant =
6631 dyn_cast<ConstantInt>(Insert->getOperand(2));
6632 // Check that the insertelement is inserting into element 0
6633 if (!ElementConstant || !ElementConstant->isZero())
6634 continue;
6635
6636 unsigned Opcode = OperandInstr->getOpcode();
6637 if (Opcode == Instruction::SExt)
6638 NumSExts++;
6639 else if (Opcode == Instruction::ZExt)
6640 NumZExts++;
6641 else {
6642 // If we find that the top bits are known 0, then we can sink and allow
6643 // the backend to generate a umull.
6644 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6645 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6646 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6647 continue;
6648 NumZExts++;
6649 }
6650
6651 // And(Load) is excluded to prevent CGP from getting stuck in a loop of sinking
6652 // the And, just to hoist it again back to the load.
6653 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6654 Ops.push_back(&Insert->getOperandUse(1));
6655 Ops.push_back(&Shuffle->getOperandUse(0));
6656 Ops.push_back(&Op);
6657 }
6658
6659 // It is profitable to sink if we found two of the same type of extends.
6660 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6661 return true;
6662
6663 // Otherwise, see if we should sink splats for indexed variants.
6664 if (!ShouldSinkSplatForIndexedVariant(I))
6665 return false;
6666
6667 Ops.clear();
6668 if (isSplatShuffle(I->getOperand(0)))
6669 Ops.push_back(&I->getOperandUse(0));
6670 if (isSplatShuffle(I->getOperand(1)))
6671 Ops.push_back(&I->getOperandUse(1));
6672
6673 return !Ops.empty();
6674 }
6675 case Instruction::FMul: {
6676 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6677 if (I->getType()->isScalableTy())
6678 return false;
6679
6680 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6681 !ST->hasFullFP16())
6682 return false;
6683
6684 // Sink splats for index lane variants
6685 if (isSplatShuffle(I->getOperand(0)))
6686 Ops.push_back(&I->getOperandUse(0));
6687 if (isSplatShuffle(I->getOperand(1)))
6688 Ops.push_back(&I->getOperandUse(1));
6689 return !Ops.empty();
6690 }
6691 default:
6692 return false;
6693 }
6694 return false;
6695}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
Hexagon Common GEP
const HexagonInstrInfo * TII
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const override
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool enableScalableVectorization() const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
unsigned countLeadingOnes() const
Definition APInt.h:1624
void negate()
Negate this APInt in place.
Definition APInt.h:1468
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:681
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:708
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:682
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:683
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:686
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:689
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:685
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:687
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:706
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:694
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:688
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
bool isUnsigned() const
Definition InstrTypes.h:938
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:669
bool empty() const
Definition DenseMap.h:107
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:156
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2571
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1107
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2559
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1923
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2286
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2494
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1714
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2204
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1847
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2593
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1860
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2277
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
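A minimal sketch of how the SME attribute queries above might be used to spot a streaming-mode mismatch between caller and callee; this is illustrative only, not the exact rule applied by this file, and it assumes the backend-internal Utils/AArch64SMEAttributes.h header:
#include "Utils/AArch64SMEAttributes.h"
// Illustrative only: assume a call may need a PSTATE.SM change when caller
// and callee disagree on having a streaming interface or body.
static bool mayRequireSMChange(const llvm::SMEAttrs &Caller,
                               const llvm::SMEAttrs &Callee) {
  return Caller.hasStreamingInterfaceOrBody() !=
         Callee.hasStreamingInterfaceOrBody();
}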
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:825
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
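A minimal sketch of how the ScalarEvolution queries above are typically used when sizing loops (the function, threshold and inputs are hypothetical, in the general style of unrolling heuristics):
#include "llvm/Analysis/ScalarEvolution.h"
// Minimal sketch: accept only loops whose trip count is known and small.
// SE, L and Threshold are hypothetical inputs.
static bool hasSmallKnownTripCount(llvm::ScalarEvolution &SE, llvm::Loop *L,
                                   unsigned Threshold) {
  const llvm::SCEV *BTC = SE.getBackedgeTakenCount(L);
  if (llvm::isa<llvm::SCEVCouldNotCompute>(BTC))
    return false;
  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L); // 0 means unknown
  return MaxTC != 0 && MaxTC <= Threshold;
}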
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
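A minimal sketch using the static ShuffleVectorInst mask classifiers above to detect a contiguous subvector extract (the function and its inputs are hypothetical):
#include "llvm/IR/Instructions.h"
// Minimal sketch: on success StartIndex is set to the offset of the
// extracted subvector within the source. Mask and NumSrcElts are
// hypothetical inputs.
static bool isSubvectorExtractMask(llvm::ArrayRef<int> Mask, int NumSrcElts,
                                   int &StartIndex) {
  return llvm::ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts,
                                                         StartIndex);
}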
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:31
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:40
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:710
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
virtual const DataLayout & getDataLayout() const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
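A minimal sketch combining the Type helpers above to widen the lanes of an integer or vector type, e.g. <4 x i16> to <4 x i32> (the function name and input are hypothetical):
#include "llvm/IR/DerivedTypes.h"
// Minimal sketch: double the lane bit width while keeping the element count.
// Ty is a hypothetical integer or vector-of-integer type.
static llvm::Type *widenLanes(llvm::Type *Ty) {
  unsigned NewBits = Ty->getScalarSizeInBits() * 2;
  return Ty->getWithNewBitWidth(NewBits);
}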
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
const ParentTy * getParent() const
Definition ilist_node.h:34
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
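A minimal sketch showing how the PatternMatch combinators above compose, here recognising `(zext X) + (zext Y)` with the commutative add matcher (the function and the captured values are hypothetical):
#include "llvm/IR/PatternMatch.h"
// Minimal sketch: capture both zero-extended operands of an add, accepting
// the operands in either order. I, X and Y are hypothetical.
static bool matchAddOfZExts(llvm::Value *I, llvm::Value *&X, llvm::Value *&Y) {
  using namespace llvm::PatternMatch;
  return match(I, m_c_Add(m_ZExt(m_Value(X)), m_ZExt(m_Value(Y))));
}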
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:310
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1707
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
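A minimal sketch of consulting a static per-type cost table via CostTableLookup, in the general style of target cost models; the table entries and costs below are purely illustrative:
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
// Minimal sketch: return a table-driven cost, falling back to a default when
// the type is not listed. The entries are hypothetical, not real costs.
static unsigned lookupMulCost(llvm::MVT Ty) {
  using namespace llvm;
  static const CostTblEntry MulTbl[] = {
      {ISD::MUL, MVT::v4i32, 1}, // hypothetical cost
      {ISD::MUL, MVT::v2i64, 4}, // hypothetical cost
  };
  if (const auto *Entry = CostTableLookup(MulTbl, ISD::MUL, Ty))
    return Entry->Cost;
  return 1; // default when the type is not in the table
}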
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2454
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Uninitialized
Definition Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1721
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
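A minimal sketch using the isZIPMask/isUZPMask helpers above, assuming the backend-internal AArch64PerfectShuffle.h header that declares them; the function and its inputs are hypothetical:
#include "llvm/ADT/ArrayRef.h"
// Minimal sketch: treat a shuffle as cheap if its mask maps onto a single
// zip1/zip2 or uzp1/uzp2 instruction. Mask and NumElts are hypothetical.
static bool isSingleZipOrUzpMask(llvm::ArrayRef<int> Mask, unsigned NumElts) {
  unsigned WhichResult = 0; // identifies which result register on success
  return llvm::isZIPMask(Mask, NumElts, WhichResult) ||
         llvm::isUZPMask(Mask, NumElts, WhichResult);
}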
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1879
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2090
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
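A companion sketch to the CostTableLookup example above, this time keying the lookup on both destination and source type with ConvertCostTableLookup (same includes as before; entries are illustrative only):
// Minimal sketch: table-driven cast cost with a default fallback. The
// entries and costs are hypothetical.
static unsigned lookupZExtCost(llvm::MVT Dst, llvm::MVT Src) {
  using namespace llvm;
  static const TypeConversionCostTblEntry ConvTbl[] = {
      {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1}, // hypothetical cost
      {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1}, // hypothetical cost
  };
  if (const auto *Entry =
          ConvertCostTableLookup(ConvTbl, ISD::ZERO_EXTEND, Dst, Src))
    return Entry->Cost;
  return 2; // default when the conversion is not in the table
}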
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:384
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
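A minimal sketch of the common pattern of mapping an IR type to a simple machine value type before consulting cost tables, built from the EVT helpers above (the function name and input are hypothetical):
#include "llvm/CodeGen/ValueTypes.h"
#include <optional>
// Minimal sketch: return the MVT for Ty when it legalizes to a simple value
// type, and nothing otherwise. Ty is a hypothetical IR type.
static std::optional<llvm::MVT> getSimpleMVTFor(llvm::Type *Ty) {
  llvm::EVT VT = llvm::EVT::getEVT(Ty, /*HandleUnknown=*/true);
  if (!VT.isSimple())
    return std::nullopt;
  return VT.getSimpleVT();
}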
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp; IsZeroCmp is true when the expansion is for memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...