AArch64TargetTransformInfo.cpp (LLVM 22.0.0git)
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
24#include "llvm/IR/PatternMatch.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
40static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
44 cl::Hidden);
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
52static cl::opt<unsigned>
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
54 cl::Hidden);
55
56static cl::opt<unsigned> CallPenaltyChangeSM(
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
58 cl::desc(
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
61static cl::opt<unsigned> InlineCallPenaltyChangeSM(
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
72static cl::opt<unsigned>
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
76static cl::opt<unsigned> DMBLookaheadThreshold(
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
80namespace {
81class TailFoldingOption {
82 // These bitfields will only ever be set to something non-zero in operator=,
83 // when setting the -sve-tail-folding option. This option should always be of
84 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
85 // InitialBits is one of (disabled|all|simple). EnableBits represents
86 // additional flags we're enabling, and DisableBits for those flags we're
87 // disabling. The default flag is tracked in the variable NeedsDefault, since
88 // at the time of setting the option we may not know what the default value
89 // for the CPU is.
90 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
91 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
92 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
93
94 // This value needs to be initialised to true in case the user does not
95 // explicitly set the -sve-tail-folding option.
96 bool NeedsDefault = true;
97
98 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
99
100 void setNeedsDefault(bool V) { NeedsDefault = V; }
101
102 void setEnableBit(TailFoldingOpts Bit) {
103 EnableBits |= Bit;
104 DisableBits &= ~Bit;
105 }
106
107 void setDisableBit(TailFoldingOpts Bit) {
108 EnableBits &= ~Bit;
109 DisableBits |= Bit;
110 }
111
112 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
113 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
114
115 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
116 "Initial bits should only include one of "
117 "(disabled|all|simple|default)");
118 Bits = NeedsDefault ? DefaultBits : InitialBits;
119 Bits |= EnableBits;
120 Bits &= ~DisableBits;
121
122 return Bits;
123 }
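 // For example (illustrative): with -sve-tail-folding=all+noreverse the
 // operator= below records InitialBits = TailFoldingOpts::All and
 // DisableBits = TailFoldingOpts::Reverse, so getBits() returns All with the
 // Reverse bit cleared, independent of the CPU's DefaultBits.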
124
125 void reportError(std::string Opt) {
126 errs() << "invalid argument '" << Opt
127 << "' to -sve-tail-folding=; the option should be of the form\n"
128 " (disabled|all|default|simple)[+(reductions|recurrences"
129 "|reverse|noreductions|norecurrences|noreverse)]\n";
130 report_fatal_error("Unrecognised tail-folding option");
131 }
132
133public:
134
135 void operator=(const std::string &Val) {
136 // If the user explicitly sets -sve-tail-folding= then treat as an error.
137 if (Val.empty()) {
138 reportError("");
139 return;
140 }
141
142 // Since the user is explicitly setting the option we don't automatically
143 // need the default unless they require it.
144 setNeedsDefault(false);
145
146 SmallVector<StringRef, 4> TailFoldTypes;
147 StringRef(Val).split(TailFoldTypes, '+', -1, false);
148
149 unsigned StartIdx = 1;
150 if (TailFoldTypes[0] == "disabled")
151 setInitialBits(TailFoldingOpts::Disabled);
152 else if (TailFoldTypes[0] == "all")
153 setInitialBits(TailFoldingOpts::All);
154 else if (TailFoldTypes[0] == "default")
155 setNeedsDefault(true);
156 else if (TailFoldTypes[0] == "simple")
157 setInitialBits(TailFoldingOpts::Simple);
158 else {
159 StartIdx = 0;
160 setInitialBits(TailFoldingOpts::Disabled);
161 }
162
163 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
164 if (TailFoldTypes[I] == "reductions")
165 setEnableBit(TailFoldingOpts::Reductions);
166 else if (TailFoldTypes[I] == "recurrences")
167 setEnableBit(TailFoldingOpts::Recurrences);
168 else if (TailFoldTypes[I] == "reverse")
169 setEnableBit(TailFoldingOpts::Reverse);
170 else if (TailFoldTypes[I] == "noreductions")
171 setDisableBit(TailFoldingOpts::Reductions);
172 else if (TailFoldTypes[I] == "norecurrences")
173 setDisableBit(TailFoldingOpts::Recurrences);
174 else if (TailFoldTypes[I] == "noreverse")
175 setDisableBit(TailFoldingOpts::Reverse);
176 else
177 reportError(Val);
178 }
179 }
180
181 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
182 return (getBits(DefaultBits) & Required) == Required;
183 }
184};
185} // namespace
186
187TailFoldingOption TailFoldingOptionLoc;
188
189cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
190 "sve-tail-folding",
191 cl::desc(
192 "Control the use of vectorisation using tail-folding for SVE where the"
193 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
194 "\ndisabled (Initial) No loop types will vectorize using "
195 "tail-folding"
196 "\ndefault (Initial) Uses the default tail-folding settings for "
197 "the target CPU"
198 "\nall (Initial) All legal loop types will vectorize using "
199 "tail-folding"
200 "\nsimple (Initial) Use tail-folding for simple loops (not "
201 "reductions or recurrences)"
202 "\nreductions Use tail-folding for loops containing reductions"
203 "\nnoreductions Inverse of above"
204 "\nrecurrences Use tail-folding for loops containing fixed order "
205 "recurrences"
206 "\nnorecurrences Inverse of above"
207 "\nreverse Use tail-folding for loops requiring reversed "
208 "predicates"
209 "\nnoreverse Inverse of above"),
210 cl::location(TailFoldingOptionLoc));
211
212// Experimental option that will only be fully functional when the
213// code-generator is changed to use SVE instead of NEON for all fixed-width
214// operations.
215static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
216 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
217
218// Experimental option that will only be fully functional when the cost-model
219// and code-generator have been changed to avoid using scalable vector
220// instructions that are not legal in streaming SVE mode.
221static cl::opt<bool> EnableScalableAutovecInStreamingMode(
222 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
223
224static bool isSMEABIRoutineCall(const CallInst &CI,
225 const AArch64TargetLowering &TLI) {
226 const auto *F = CI.getCalledFunction();
227 return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine();
228}
229
230/// Returns true if the function has explicit operations that can only be
231/// lowered using incompatible instructions for the selected mode. This also
232/// returns true if the function F may use or modify ZA state.
233static bool hasPossibleIncompatibleOps(const Function *F,
234 const AArch64TargetLowering &TLI) {
235 for (const BasicBlock &BB : *F) {
236 for (const Instruction &I : BB) {
237 // Be conservative for now and assume that any call to inline asm or to
238 // intrinsics could result in non-streaming ops (e.g. calls to
239 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
240 // all native LLVM instructions can be lowered to compatible instructions.
241 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
242 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
243 isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
244 return true;
245 }
246 }
247 return false;
248}
249
250uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
251 StringRef AttributeStr =
252 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
253 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
254 SmallVector<StringRef, 8> Features;
255 FeatureStr.split(Features, ",");
256 return AArch64::getFMVPriority(Features);
257}
258
259bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
260 return F.hasFnAttribute("fmv-features");
261}
262
263const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
264 AArch64::FeatureExecuteOnly,
265};
266
267bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
268 const Function *Callee) const {
269 SMECallAttrs CallAttrs(*Caller, *Callee);
270
271 // Never inline a function explicitly marked as being streaming,
272 // into a non-streaming function. Assume it was marked as streaming
273 // for a reason.
274 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
275 CallAttrs.callee().hasStreamingInterface())
276 return false;
277
278 // When inlining, we should consider the body of the function, not the
279 // interface.
280 if (CallAttrs.callee().hasStreamingBody()) {
281 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
282 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
283 }
284
285 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
286 return false;
287
288 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
289 CallAttrs.requiresPreservingZT0() ||
290 CallAttrs.requiresPreservingAllZAState()) {
291 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
292 return false;
293 }
294
295 const TargetMachine &TM = getTLI()->getTargetMachine();
296 const FeatureBitset &CallerBits =
297 TM.getSubtargetImpl(*Caller)->getFeatureBits();
298 const FeatureBitset &CalleeBits =
299 TM.getSubtargetImpl(*Callee)->getFeatureBits();
300 // Adjust the feature bitsets by inverting some of the bits. This is needed
301 // for target features that represent restrictions rather than capabilities,
302 // for example a "+execute-only" callee can be inlined into a caller without
303 // "+execute-only", but not vice versa.
304 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
305 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
306
307 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
308}
309
310bool AArch64TTIImpl::areTypesABICompatible(
311 const Function *Caller, const Function *Callee,
312 const ArrayRef<Type *> &Types) const {
313 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
314 return false;
315
316 // We need to ensure that argument promotion does not attempt to promote
317 // pointers to fixed-length vector types larger than 128 bits like
318 // <8 x float> (and pointers to aggregate types which have such fixed-length
319 // vector type members) into the values of the pointees. Such vector types
320 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
321 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
322 // types can be safely treated as 128-bit NEON types and they cannot be
323 // distinguished in IR.
324 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
325 auto FVTy = dyn_cast<FixedVectorType>(Ty);
326 return FVTy &&
327 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
328 }))
329 return false;
330
331 return true;
332}
333
334unsigned
335AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
336 unsigned DefaultCallPenalty) const {
337 // This function calculates a penalty for executing Call in F.
338 //
339 // There are two ways this function can be called:
340 // (1) F:
341 // call from F -> G (the call here is Call)
342 //
343 // For (1), Call.getCaller() == F, so it will always return a high cost if
344 // a streaming-mode change is required (thus promoting the need to inline the
345 // function)
346 //
347 // (2) F:
348 // call from F -> G (the call here is not Call)
349 // G:
350 // call from G -> H (the call here is Call)
351 //
352 // For (2), if after inlining the body of G into F the call to H requires a
353 // streaming-mode change, and the call to G from F would also require a
354 // streaming-mode change, then there is benefit to do the streaming-mode
355 // change only once and avoid inlining of G into F.
356
357 SMEAttrs FAttrs(*F);
358 SMECallAttrs CallAttrs(Call, getTLI());
359
360 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
361 if (F == Call.getCaller()) // (1)
362 return CallPenaltyChangeSM * DefaultCallPenalty;
363 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
364 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
365 }
366
367 return DefaultCallPenalty;
368}
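// Illustrative example, assuming the default values of the options above: in
// case (1) a call needing a streaming-mode change is reported as
// CallPenaltyChangeSM * DefaultCallPenalty (5x), which makes the call to G
// look expensive and encourages inlining G into F; in case (2) it is reported
// as InlineCallPenaltyChangeSM * DefaultCallPenalty (10x), which discourages
// inlining G into F so that only the single mode change for F -> G is paid.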
369
376
377/// Calculate the cost of materializing a 64-bit value. This helper
378/// method might only calculate a fraction of a larger immediate. Therefore it
379/// is valid to return a cost of ZERO.
380InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
381 // Check if the immediate can be encoded within an instruction.
382 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
383 return 0;
384
385 if (Val < 0)
386 Val = ~Val;
387
388 // Calculate how many moves we will need to materialize this constant.
389 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
390 AArch64_IMM::expandMOVImm(Val, 64, Insn);
391 return Insn.size();
392}
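// For example (illustrative): Val = 0x123456789ABCDEF0 is not a logical
// immediate and expands to one MOVZ plus three MOVKs, so the cost is 4,
// whereas Val = 0xFFFF is a valid 64-bit logical immediate and costs 0 here.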
393
394/// Calculate the cost of materializing the given constant.
395InstructionCost
396AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
397 TTI::TargetCostKind CostKind) const {
398 assert(Ty->isIntegerTy());
399
400 unsigned BitSize = Ty->getPrimitiveSizeInBits();
401 if (BitSize == 0)
402 return ~0U;
403
404 // Sign-extend all constants to a multiple of 64-bit.
405 APInt ImmVal = Imm;
406 if (BitSize & 0x3f)
407 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
408
409 // Split the constant into 64-bit chunks and calculate the cost for each
410 // chunk.
411 InstructionCost Cost = 0;
412 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
413 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
414 int64_t Val = Tmp.getSExtValue();
415 Cost += getIntImmCost(Val);
416 }
417 // We need at least one instruction to materialize the constant.
418 return std::max<InstructionCost>(1, Cost);
419}
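// For example (illustrative): a 128-bit constant is costed as two 64-bit
// chunks; if each chunk needs a full MOVZ+MOVK sequence the per-chunk costs
// add up, while a small value such as 1 (encodable low chunk, zero high
// chunk) still returns the minimum cost of 1.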
420
421InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
422 const APInt &Imm, Type *Ty,
423 TTI::TargetCostKind CostKind,
424 Instruction *Inst) const {
425 assert(Ty->isIntegerTy());
426
427 unsigned BitSize = Ty->getPrimitiveSizeInBits();
428 // There is no cost model for constants with a bit size of 0. Return TCC_Free
429 // here, so that constant hoisting will ignore this constant.
430 if (BitSize == 0)
431 return TTI::TCC_Free;
432
433 unsigned ImmIdx = ~0U;
434 switch (Opcode) {
435 default:
436 return TTI::TCC_Free;
437 case Instruction::GetElementPtr:
438 // Always hoist the base address of a GetElementPtr.
439 if (Idx == 0)
440 return 2 * TTI::TCC_Basic;
441 return TTI::TCC_Free;
442 case Instruction::Store:
443 ImmIdx = 0;
444 break;
445 case Instruction::Add:
446 case Instruction::Sub:
447 case Instruction::Mul:
448 case Instruction::UDiv:
449 case Instruction::SDiv:
450 case Instruction::URem:
451 case Instruction::SRem:
452 case Instruction::And:
453 case Instruction::Or:
454 case Instruction::Xor:
455 case Instruction::ICmp:
456 ImmIdx = 1;
457 break;
458 // Always return TCC_Free for the shift value of a shift instruction.
459 case Instruction::Shl:
460 case Instruction::LShr:
461 case Instruction::AShr:
462 if (Idx == 1)
463 return TTI::TCC_Free;
464 break;
465 case Instruction::Trunc:
466 case Instruction::ZExt:
467 case Instruction::SExt:
468 case Instruction::IntToPtr:
469 case Instruction::PtrToInt:
470 case Instruction::BitCast:
471 case Instruction::PHI:
472 case Instruction::Call:
473 case Instruction::Select:
474 case Instruction::Ret:
475 case Instruction::Load:
476 break;
477 }
478
479 if (Idx == ImmIdx) {
480 int NumConstants = (BitSize + 63) / 64;
481 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
482 return (Cost <= NumConstants * TTI::TCC_Basic)
483 ? static_cast<int>(TTI::TCC_Free)
484 : Cost;
485 }
486 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
487}
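// For example (illustrative): for `add i64 %x, 42` the immediate at Idx == 1
// materializes in a single instruction, so it is reported as TCC_Free and
// left in place, while a wide constant such as 0x123456789ABCDEF0 needs
// several MOVs and becomes a candidate for constant hoisting.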
488
489InstructionCost
490AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
491 const APInt &Imm, Type *Ty,
492 TTI::TargetCostKind CostKind) const {
493 assert(Ty->isIntegerTy());
494
495 unsigned BitSize = Ty->getPrimitiveSizeInBits();
496 // There is no cost model for constants with a bit size of 0. Return TCC_Free
497 // here, so that constant hoisting will ignore this constant.
498 if (BitSize == 0)
499 return TTI::TCC_Free;
500
501 // Most (all?) AArch64 intrinsics do not support folding immediates into the
502 // selected instruction, so we compute the materialization cost for the
503 // immediate directly.
504 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
505 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
506
507 switch (IID) {
508 default:
509 return TTI::TCC_Free;
510 case Intrinsic::sadd_with_overflow:
511 case Intrinsic::uadd_with_overflow:
512 case Intrinsic::ssub_with_overflow:
513 case Intrinsic::usub_with_overflow:
514 case Intrinsic::smul_with_overflow:
515 case Intrinsic::umul_with_overflow:
516 if (Idx == 1) {
517 int NumConstants = (BitSize + 63) / 64;
518 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
519 return (Cost <= NumConstants * TTI::TCC_Basic)
520 ? static_cast<int>(TTI::TCC_Free)
521 : Cost;
522 }
523 break;
524 case Intrinsic::experimental_stackmap:
525 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
526 return TTI::TCC_Free;
527 break;
528 case Intrinsic::experimental_patchpoint_void:
529 case Intrinsic::experimental_patchpoint:
530 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
531 return TTI::TCC_Free;
532 break;
533 case Intrinsic::experimental_gc_statepoint:
534 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
535 return TTI::TCC_Free;
536 break;
537 }
538 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
539}
540
541TTI::PopcntSupportKind
542AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
543 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
544 if (TyWidth == 32 || TyWidth == 64)
545 return TTI::PSK_FastHardware;
546 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
547 return TTI::PSK_Software;
548}
549
550static bool isUnpackedVectorVT(EVT VecVT) {
551 return VecVT.isScalableVector() &&
552 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
553}
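// For example (illustrative): nxv2f32 has a known minimum size of 64 bits and
// is therefore "unpacked" (its elements only fill part of each 128-bit SVE
// block), whereas nxv4f32 fills the full block and is packed.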
554
555static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
556 const IntrinsicCostAttributes &ICA) {
557 // We need to know at least the number of elements in the vector of buckets
558 // and the size of each element to update.
559 if (ICA.getArgTypes().size() < 2)
560 return InstructionCost::getInvalid();
561
562 // Only interested in costing for the hardware instruction from SVE2.
563 if (!ST->hasSVE2())
564 return InstructionCost::getInvalid();
565
566 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
567 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
568 unsigned TotalHistCnts = 1;
569
570 unsigned EltSize = EltTy->getScalarSizeInBits();
571 // Only allow (up to 64b) integers or pointers
572 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
573 return InstructionCost::getInvalid();
574
575 // FIXME: We should be able to generate histcnt for fixed-length vectors
576 // using ptrue with a specific VL.
577 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
578 unsigned EC = VTy->getElementCount().getKnownMinValue();
579 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
580 return InstructionCost::getInvalid();
581
582 // HistCnt only supports 32b and 64b element types
583 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
584
585 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
586 return InstructionCost(BaseHistCntCost);
587
588 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
589 TotalHistCnts = EC / NaturalVectorWidth;
590
591 return InstructionCost(BaseHistCntCost * TotalHistCnts);
592 }
593
594 return InstructionCost::getInvalid();
595}
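// For example (illustrative, with the default aarch64-base-histcnt-cost of 8):
// <vscale x 4 x ptr> bucket pointers updating i32 counters map onto a single
// SVE2 HISTCNT and cost 8, while <vscale x 8 x ptr> with i32 counters needs
// EC / (128 / 32) == 2 HISTCNTs and costs 16.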
596
597InstructionCost
598AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
599 TTI::TargetCostKind CostKind) const {
600 // The code-generator is currently not able to handle scalable vectors
601 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
602 // it. This change will be removed when code-generation for these types is
603 // sufficiently reliable.
604 auto *RetTy = ICA.getReturnType();
605 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
606 if (VTy->getElementCount() == ElementCount::getScalable(1))
607 return InstructionCost::getInvalid();
608
609 switch (ICA.getID()) {
610 case Intrinsic::experimental_vector_histogram_add: {
611 InstructionCost HistCost = getHistogramCost(ST, ICA);
612 // If the cost isn't valid, we may still be able to scalarize
613 if (HistCost.isValid())
614 return HistCost;
615 break;
616 }
617 case Intrinsic::umin:
618 case Intrinsic::umax:
619 case Intrinsic::smin:
620 case Intrinsic::smax: {
621 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
622 MVT::v8i16, MVT::v2i32, MVT::v4i32,
623 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
624 MVT::nxv2i64};
625 auto LT = getTypeLegalizationCost(RetTy);
626 // v2i64 types get converted to cmp+bif hence the cost of 2
627 if (LT.second == MVT::v2i64)
628 return LT.first * 2;
629 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
630 return LT.first;
631 break;
632 }
633 case Intrinsic::sadd_sat:
634 case Intrinsic::ssub_sat:
635 case Intrinsic::uadd_sat:
636 case Intrinsic::usub_sat: {
637 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
638 MVT::v8i16, MVT::v2i32, MVT::v4i32,
639 MVT::v2i64};
640 auto LT = getTypeLegalizationCost(RetTy);
641 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
642 // need to extend the type, as it uses shr(qadd(shl, shl)).
643 unsigned Instrs =
644 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
645 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
646 return LT.first * Instrs;
647
648 TypeSize TS = DL.getTypeSizeInBits(RetTy);
649 uint64_t VectorSize = TS.getKnownMinValue();
650
651 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
652 return LT.first * Instrs;
653
654 break;
655 }
656 case Intrinsic::abs: {
657 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
658 MVT::v8i16, MVT::v2i32, MVT::v4i32,
659 MVT::v2i64};
660 auto LT = getTypeLegalizationCost(RetTy);
661 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
662 return LT.first;
663 break;
664 }
665 case Intrinsic::bswap: {
666 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
667 MVT::v4i32, MVT::v2i64};
668 auto LT = getTypeLegalizationCost(RetTy);
669 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
670 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
671 return LT.first;
672 break;
673 }
674 case Intrinsic::fma:
675 case Intrinsic::fmuladd: {
676 // Given an fma or fmuladd, cost it the same as an fmul instruction, since
677 // the costs are usually the same. TODO: Add fp16 and bf16 expansion costs.
678 Type *EltTy = RetTy->getScalarType();
679 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
680 (EltTy->isHalfTy() && ST->hasFullFP16()))
681 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
682 break;
683 }
684 case Intrinsic::stepvector: {
685 InstructionCost Cost = 1; // Cost of the `index' instruction
686 auto LT = getTypeLegalizationCost(RetTy);
687 // Legalisation of illegal vectors involves an `index' instruction plus
688 // (LT.first - 1) vector adds.
689 if (LT.first > 1) {
690 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
691 InstructionCost AddCost =
692 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
693 Cost += AddCost * (LT.first - 1);
694 }
695 return Cost;
696 }
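// For example (illustrative): a <vscale x 8 x i32> stepvector legalizes to
// two <vscale x 4 x i32> parts, so the cost is one `index` instruction plus
// one vector add used to offset the second part.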
697 case Intrinsic::vector_extract:
698 case Intrinsic::vector_insert: {
699 // If both the vector and subvector types are legal types and the index
700 // is 0, then this should be a no-op or simple operation; return a
701 // relatively low cost.
702
703 // If arguments aren't actually supplied, then we cannot determine the
704 // value of the index. We also want to skip predicate types.
705 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
706 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
707 break;
708
709 LLVMContext &C = RetTy->getContext();
710 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
711 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
712 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
713 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
714 // Skip this if either the vector or subvector types are unpacked
715 // SVE types; they may get lowered to stack stores and loads.
716 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
717 break;
718
719 TargetLoweringBase::LegalizeKind SubVecLK =
720 getTLI()->getTypeConversion(C, SubVecVT);
721 TargetLoweringBase::LegalizeKind VecLK =
722 getTLI()->getTypeConversion(C, VecVT);
723 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
724 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
725 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
726 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
727 return TTI::TCC_Free;
728 break;
729 }
730 case Intrinsic::bitreverse: {
731 static const CostTblEntry BitreverseTbl[] = {
732 {Intrinsic::bitreverse, MVT::i32, 1},
733 {Intrinsic::bitreverse, MVT::i64, 1},
734 {Intrinsic::bitreverse, MVT::v8i8, 1},
735 {Intrinsic::bitreverse, MVT::v16i8, 1},
736 {Intrinsic::bitreverse, MVT::v4i16, 2},
737 {Intrinsic::bitreverse, MVT::v8i16, 2},
738 {Intrinsic::bitreverse, MVT::v2i32, 2},
739 {Intrinsic::bitreverse, MVT::v4i32, 2},
740 {Intrinsic::bitreverse, MVT::v1i64, 2},
741 {Intrinsic::bitreverse, MVT::v2i64, 2},
742 };
743 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
744 const auto *Entry =
745 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
746 if (Entry) {
747 // Cost Model is using the legal type(i32) that i8 and i16 will be
748 // converted to +1 so that we match the actual lowering cost
749 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
750 TLI->getValueType(DL, RetTy, true) == MVT::i16)
751 return LegalisationCost.first * Entry->Cost + 1;
752
753 return LegalisationCost.first * Entry->Cost;
754 }
755 break;
756 }
757 case Intrinsic::ctpop: {
758 if (!ST->hasNEON()) {
759 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
760 return getTypeLegalizationCost(RetTy).first * 12;
761 }
762 static const CostTblEntry CtpopCostTbl[] = {
763 {ISD::CTPOP, MVT::v2i64, 4},
764 {ISD::CTPOP, MVT::v4i32, 3},
765 {ISD::CTPOP, MVT::v8i16, 2},
766 {ISD::CTPOP, MVT::v16i8, 1},
767 {ISD::CTPOP, MVT::i64, 4},
768 {ISD::CTPOP, MVT::v2i32, 3},
769 {ISD::CTPOP, MVT::v4i16, 2},
770 {ISD::CTPOP, MVT::v8i8, 1},
771 {ISD::CTPOP, MVT::i32, 5},
772 };
773 auto LT = getTypeLegalizationCost(RetTy);
774 MVT MTy = LT.second;
775 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
776 // Extra cost of +1 when illegal vector types are legalized by promoting
777 // the integer type.
778 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
779 RetTy->getScalarSizeInBits()
780 ? 1
781 : 0;
782 return LT.first * Entry->Cost + ExtraCost;
783 }
784 break;
785 }
786 case Intrinsic::sadd_with_overflow:
787 case Intrinsic::uadd_with_overflow:
788 case Intrinsic::ssub_with_overflow:
789 case Intrinsic::usub_with_overflow:
790 case Intrinsic::smul_with_overflow:
791 case Intrinsic::umul_with_overflow: {
792 static const CostTblEntry WithOverflowCostTbl[] = {
793 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
794 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
795 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
796 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
797 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
798 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
799 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
800 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
801 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
802 {Intrinsic::usub_with_overflow, MVT::i8, 3},
803 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
804 {Intrinsic::usub_with_overflow, MVT::i16, 3},
805 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
806 {Intrinsic::usub_with_overflow, MVT::i32, 1},
807 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
808 {Intrinsic::usub_with_overflow, MVT::i64, 1},
809 {Intrinsic::smul_with_overflow, MVT::i8, 5},
810 {Intrinsic::umul_with_overflow, MVT::i8, 4},
811 {Intrinsic::smul_with_overflow, MVT::i16, 5},
812 {Intrinsic::umul_with_overflow, MVT::i16, 4},
813 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
814 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
815 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
816 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
817 };
818 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
819 if (MTy.isSimple())
820 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
821 MTy.getSimpleVT()))
822 return Entry->Cost;
823 break;
824 }
825 case Intrinsic::fptosi_sat:
826 case Intrinsic::fptoui_sat: {
827 if (ICA.getArgTypes().empty())
828 break;
829 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
830 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
831 EVT MTy = TLI->getValueType(DL, RetTy);
832 // Check for the legal types, which are where the size of the input and the
833 // output are the same, or we are using cvt f64->i32 or f32->i64.
834 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
835 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
836 LT.second == MVT::v2f64)) {
837 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
838 (LT.second == MVT::f64 && MTy == MVT::i32) ||
839 (LT.second == MVT::f32 && MTy == MVT::i64)))
840 return LT.first;
841 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
842 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
843 MTy.getScalarSizeInBits() == 64)
844 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
845 }
846 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
847 // f32.
848 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
849 return LT.first + getIntrinsicInstrCost(
850 {ICA.getID(),
851 RetTy,
852 {ICA.getArgTypes()[0]->getWithNewType(
853 Type::getFloatTy(RetTy->getContext()))}},
854 CostKind);
855 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
856 (LT.second == MVT::f16 && MTy == MVT::i64) ||
857 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
858 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
859 return LT.first;
860 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
861 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
862 MTy.getScalarSizeInBits() == 32)
863 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
864 // Extending vector types v8f16->v8i64. These currently scalarize but the
865 // codegen could be better.
866 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
867 MTy.getScalarSizeInBits() == 64)
868 return MTy.getVectorNumElements() * 3;
869
870 // If we can we use a legal convert followed by a min+max
871 if ((LT.second.getScalarType() == MVT::f32 ||
872 LT.second.getScalarType() == MVT::f64 ||
873 LT.second.getScalarType() == MVT::f16) &&
874 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
875 Type *LegalTy =
876 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
877 if (LT.second.isVector())
878 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
879 InstructionCost Cost = 1;
880 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
881 LegalTy, {LegalTy, LegalTy});
882 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
883 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
884 LegalTy, {LegalTy, LegalTy});
885 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
886 return LT.first * Cost +
887 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
888 : 1);
889 }
890 // Otherwise we need to follow the default expansion that clamps the value
891 // using a float min/max with a fcmp+sel for nan handling when signed.
892 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
893 RetTy = RetTy->getScalarType();
894 if (LT.second.isVector()) {
895 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
896 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
897 }
898 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
899 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
900 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
901 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
902 Cost +=
903 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
904 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
905 if (IsSigned) {
906 Type *CondTy = RetTy->getWithNewBitWidth(1);
907 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
908 CmpInst::FCMP_UNO, CostKind);
909 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
910 CmpInst::FCMP_UNO, CostKind);
911 }
912 return LT.first * Cost;
913 }
914 case Intrinsic::fshl:
915 case Intrinsic::fshr: {
916 if (ICA.getArgs().empty())
917 break;
918
919 // TODO: Add handling for fshl where third argument is not a constant.
920 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
921 if (!OpInfoZ.isConstant())
922 break;
923
924 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
925 if (OpInfoZ.isUniform()) {
926 static const CostTblEntry FshlTbl[] = {
927 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
928 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
929 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
930 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
931 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
932 // to avoid having to duplicate the costs.
933 const auto *Entry =
934 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
935 if (Entry)
936 return LegalisationCost.first * Entry->Cost;
937 }
938
939 auto TyL = getTypeLegalizationCost(RetTy);
940 if (!RetTy->isIntegerTy())
941 break;
942
943 // Estimate cost manually, as types like i8 and i16 will get promoted to
944 // i32 and CostTableLookup will ignore the extra conversion cost.
945 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
946 RetTy->getScalarSizeInBits() < 64) ||
947 (RetTy->getScalarSizeInBits() % 64 != 0);
948 unsigned ExtraCost = HigherCost ? 1 : 0;
949 if (RetTy->getScalarSizeInBits() == 32 ||
950 RetTy->getScalarSizeInBits() == 64)
951 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
952 // extr instruction.
953 else if (HigherCost)
954 ExtraCost = 1;
955 else
956 break;
957 return TyL.first + ExtraCost;
958 }
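// For example (illustrative): a funnel shift of two i64 values by a constant
// amount lowers to a single EXTR, so no ExtraCost is added, whereas an i16
// funnel shift is promoted to i32 and pays the extra conversion cost
// modelled above.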
959 case Intrinsic::get_active_lane_mask: {
960 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
961 if (RetTy) {
962 EVT RetVT = getTLI()->getValueType(DL, RetTy);
963 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
964 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
965 !getTLI()->isTypeLegal(RetVT)) {
966 // We don't have enough context at this point to determine if the mask
967 // is going to be kept live after the block, which will force the vXi1
968 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
969 // For now, we just assume the vectorizer created this intrinsic and
970 // the result will be the input for a PHI. In this case the cost will
971 // be extremely high for fixed-width vectors.
972 // NOTE: getScalarizationOverhead returns a cost that's far too
973 // pessimistic for the actual generated codegen. In reality there are
974 // two instructions generated per lane.
975 return RetTy->getNumElements() * 2;
976 }
977 }
978 break;
979 }
980 case Intrinsic::experimental_vector_match: {
981 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
982 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
983 unsigned SearchSize = NeedleTy->getNumElements();
984 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
985 // Base cost for MATCH instructions. At least on the Neoverse V2 and
986 // Neoverse V3, these are cheap operations with the same latency as a
987 // vector ADD. In most cases, however, we also need to do an extra DUP.
988 // For fixed-length vectors we currently need an extra five to six
989 // instructions besides the MATCH.
990 InstructionCost Cost = 4;
991 if (isa<FixedVectorType>(RetTy))
992 Cost += 10;
993 return Cost;
994 }
995 break;
996 }
997 case Intrinsic::experimental_cttz_elts: {
998 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
999 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1000 // This will consist of a SVE brkb and a cntp instruction. These
1001 // typically have the same latency and half the throughput as a vector
1002 // add instruction.
1003 return 4;
1004 }
1005 break;
1006 }
1007 default:
1008 break;
1009 }
1010 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1011}
1012
1013/// The function will remove redundant reinterpret casts in the presence
1014/// of control flow.
1015static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1016 IntrinsicInst &II) {
1017 SmallVector<Instruction *, 32> Worklist;
1018 auto RequiredType = II.getType();
1019
1020 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1021 assert(PN && "Expected Phi Node!");
1022
1023 // Don't create a new Phi unless we can remove the old one.
1024 if (!PN->hasOneUse())
1025 return std::nullopt;
1026
1027 for (Value *IncValPhi : PN->incoming_values()) {
1028 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1029 if (!Reinterpret ||
1030 Reinterpret->getIntrinsicID() !=
1031 Intrinsic::aarch64_sve_convert_to_svbool ||
1032 RequiredType != Reinterpret->getArgOperand(0)->getType())
1033 return std::nullopt;
1034 }
1035
1036 // Create the new Phi
1037 IC.Builder.SetInsertPoint(PN);
1038 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1039 Worklist.push_back(PN);
1040
1041 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1042 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1043 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1044 Worklist.push_back(Reinterpret);
1045 }
1046
1047 // Cleanup Phi Node and reinterprets
1048 return IC.replaceInstUsesWith(II, NPN);
1049}
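// For example (illustrative IR): given
//   %phi = phi <vscale x 16 x i1> [ %a.sv, %entry ], [ %b.sv, %loop ]
//   %r = call <vscale x 4 x i1>
//            @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %phi)
// where %a.sv and %b.sv are convert.to.svbool casts of <vscale x 4 x i1>
// values, the call is replaced by a PHI over the original <vscale x 4 x i1>
// values and the conversions become dead.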
1050
1051// A collection of properties common to SVE intrinsics that allow for combines
1052// to be written without needing to know the specific intrinsic.
1053struct SVEIntrinsicInfo {
1054 //
1055 // Helper routines for common intrinsic definitions.
1056 //
1057
1058 // e.g. llvm.aarch64.sve.add pg, op1, op2
1059 // with IID ==> llvm.aarch64.sve.add_u
1060 static SVEIntrinsicInfo
1061 defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
1062 return SVEIntrinsicInfo()
1063 .setGoverningPredicateOperandIdx(0)
1064 .setOperandIdxInactiveLanesTakenFrom(1)
1065 .setMatchingUndefIntrinsic(IID);
1066 }
1067
1068 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1069 static SVEIntrinsicInfo defaultMergingUnaryOp() {
1070 return SVEIntrinsicInfo()
1071 .setGoverningPredicateOperandIdx(1)
1072 .setOperandIdxInactiveLanesTakenFrom(0)
1073 .setOperandIdxWithNoActiveLanes(0);
1074 }
1075
1076 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1077 static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() {
1078 return SVEIntrinsicInfo()
1079 .setGoverningPredicateOperandIdx(1)
1080 .setOperandIdxInactiveLanesTakenFrom(0);
1081 }
1082
1083 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1084 static SVEIntrinsicInfo defaultUndefOp() {
1085 return SVEIntrinsicInfo()
1086 .setGoverningPredicateOperandIdx(0)
1087 .setInactiveLanesAreNotDefined();
1088 }
1089
1090 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1091 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1092 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1093 return SVEIntrinsicInfo()
1094 .setGoverningPredicateOperandIdx(GPIndex)
1095 .setInactiveLanesAreUnused();
1096 }
1097
1098 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1099 // llvm.aarch64.sve.ld1 pg, ptr
1100 static SVEIntrinsicInfo defaultZeroingOp() {
1101 return SVEIntrinsicInfo()
1102 .setGoverningPredicateOperandIdx(0)
1103 .setResultIsZeroInitialized()
1104 .setInactiveLanesAreUnused();
1105 }
1106
1107 // All properties relate to predication and thus having a general predicate
1108 // is the minimum requirement to say there is intrinsic info to act on.
1109 explicit operator bool() const { return hasGoverningPredicate(); }
1110
1111 //
1112 // Properties relating to the governing predicate.
1113 //
1114
1115 bool hasGoverningPredicate() const {
1116 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1117 }
1118
1119 unsigned getGoverningPredicateOperandIdx() const {
1120 assert(hasGoverningPredicate() && "Property not set!");
1121 return GoverningPredicateIdx;
1122 }
1123
1124 SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
1125 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1126 GoverningPredicateIdx = Index;
1127 return *this;
1128 }
1129
1130 //
1131 // Properties relating to operations the intrinsic could be transformed into.
1132 // NOTE: This does not mean such a transformation is always possible, but the
1133 // knowledge makes it possible to reuse existing optimisations without needing
1134 // to embed specific handling for each intrinsic. For example, instruction
1135 // simplification can be used to optimise an intrinsic's active lanes.
1136 //
1137
1138 bool hasMatchingUndefIntrinsic() const {
1139 return UndefIntrinsic != Intrinsic::not_intrinsic;
1140 }
1141
1142 Intrinsic::ID getMatchingUndefIntrinsic() const {
1143 assert(hasMatchingUndefIntrinsic() && "Property not set!");
1144 return UndefIntrinsic;
1145 }
1146
1147 SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
1148 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1149 UndefIntrinsic = IID;
1150 return *this;
1151 }
1152
1153 bool hasMatchingIROpode() const { return IROpcode != 0; }
1154
1155 unsigned getMatchingIROpode() const {
1156 assert(hasMatchingIROpode() && "Property not set!");
1157 return IROpcode;
1158 }
1159
1160 SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
1161 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1162 IROpcode = Opcode;
1163 return *this;
1164 }
1165
1166 //
1167 // Properties relating to the result of inactive lanes.
1168 //
1169
1170 bool inactiveLanesTakenFromOperand() const {
1171 return ResultLanes == InactiveLanesTakenFromOperand;
1172 }
1173
1174 unsigned getOperandIdxInactiveLanesTakenFrom() const {
1175 assert(inactiveLanesTakenFromOperand() && "Property not set!");
1176 return OperandIdxForInactiveLanes;
1177 }
1178
1179 SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
1180 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1181 ResultLanes = InactiveLanesTakenFromOperand;
1182 OperandIdxForInactiveLanes = Index;
1183 return *this;
1184 }
1185
1186 bool inactiveLanesAreNotDefined() const {
1187 return ResultLanes == InactiveLanesAreNotDefined;
1188 }
1189
1190 SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
1191 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1192 ResultLanes = InactiveLanesAreNotDefined;
1193 return *this;
1194 }
1195
1196 bool inactiveLanesAreUnused() const {
1197 return ResultLanes == InactiveLanesAreUnused;
1198 }
1199
1200 SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1201 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1202 ResultLanes = InactiveLanesAreUnused;
1203 return *this;
1204 }
1205
1206 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1207 // inactiveLanesAreZeroed =
1208 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1209 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1210
1211 SVEIntrinsicInfo &setResultIsZeroInitialized() {
1212 ResultIsZeroInitialized = true;
1213 return *this;
1214 }
1215
1216 //
1217 // The first operand of unary merging operations is typically only used to
1218 // set the result for inactive lanes. Knowing this allows us to deadcode the
1219 // operand when we can prove there are no inactive lanes.
1220 //
1221
1222 bool hasOperandWithNoActiveLanes() const {
1223 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1224 }
1225
1226 unsigned getOperandIdxWithNoActiveLanes() const {
1227 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1228 return OperandIdxWithNoActiveLanes;
1229 }
1230
1231 SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1232 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1233 OperandIdxWithNoActiveLanes = Index;
1234 return *this;
1235 }
1236
1237private:
1238 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1239
1240 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1241 unsigned IROpcode = 0;
1242
1243 enum PredicationStyle {
1244 Uninitialized,
1245 InactiveLanesTakenFromOperand,
1246 InactiveLanesAreNotDefined,
1247 InactiveLanesAreUnused
1248 } ResultLanes = Uninitialized;
1249
1250 bool ResultIsZeroInitialized = false;
1251 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1252 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1253};
1254
1255static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1256 // Some SVE intrinsics do not use scalable vector types, but since they are
1257 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1258 if (!isa<ScalableVectorType>(II.getType()) &&
1259 all_of(II.args(), [&](const Value *V) {
1260 return !isa<ScalableVectorType>(V->getType());
1261 }))
1262 return SVEIntrinsicInfo();
1263
1264 Intrinsic::ID IID = II.getIntrinsicID();
1265 switch (IID) {
1266 default:
1267 break;
1268 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1269 case Intrinsic::aarch64_sve_fcvt_f16f32:
1270 case Intrinsic::aarch64_sve_fcvt_f16f64:
1271 case Intrinsic::aarch64_sve_fcvt_f32f16:
1272 case Intrinsic::aarch64_sve_fcvt_f32f64:
1273 case Intrinsic::aarch64_sve_fcvt_f64f16:
1274 case Intrinsic::aarch64_sve_fcvt_f64f32:
1275 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1276 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1277 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1278 case Intrinsic::aarch64_sve_fcvtzs:
1279 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1280 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1281 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1282 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1283 case Intrinsic::aarch64_sve_fcvtzu:
1284 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1285 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1286 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1287 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1288 case Intrinsic::aarch64_sve_scvtf:
1289 case Intrinsic::aarch64_sve_scvtf_f16i32:
1290 case Intrinsic::aarch64_sve_scvtf_f16i64:
1291 case Intrinsic::aarch64_sve_scvtf_f32i64:
1292 case Intrinsic::aarch64_sve_scvtf_f64i32:
1293 case Intrinsic::aarch64_sve_ucvtf:
1294 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1295 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1296 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1297 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1298 return SVEIntrinsicInfo::defaultMergingUnaryOp();
1299
1300 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1301 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1302 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1303 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1304 return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp();
1305
1306 case Intrinsic::aarch64_sve_fabd:
1307 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1308 case Intrinsic::aarch64_sve_fadd:
1309 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1310 .setMatchingIROpcode(Instruction::FAdd);
1311 case Intrinsic::aarch64_sve_fdiv:
1312 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1313 .setMatchingIROpcode(Instruction::FDiv);
1314 case Intrinsic::aarch64_sve_fmax:
1315 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1316 case Intrinsic::aarch64_sve_fmaxnm:
1317 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1318 case Intrinsic::aarch64_sve_fmin:
1319 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1320 case Intrinsic::aarch64_sve_fminnm:
1321 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1322 case Intrinsic::aarch64_sve_fmla:
1323 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1324 case Intrinsic::aarch64_sve_fmls:
1325 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1326 case Intrinsic::aarch64_sve_fmul:
1327 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1328 .setMatchingIROpcode(Instruction::FMul);
1329 case Intrinsic::aarch64_sve_fmulx:
1330 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1331 case Intrinsic::aarch64_sve_fnmla:
1332 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1333 case Intrinsic::aarch64_sve_fnmls:
1334 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1335 case Intrinsic::aarch64_sve_fsub:
1336 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1337 .setMatchingIROpcode(Instruction::FSub);
1338 case Intrinsic::aarch64_sve_add:
1339 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1340 .setMatchingIROpcode(Instruction::Add);
1341 case Intrinsic::aarch64_sve_mla:
1342 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1343 case Intrinsic::aarch64_sve_mls:
1344 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1345 case Intrinsic::aarch64_sve_mul:
1346 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1347 .setMatchingIROpcode(Instruction::Mul);
1348 case Intrinsic::aarch64_sve_sabd:
1349 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1350 case Intrinsic::aarch64_sve_sdiv:
1351 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1352 .setMatchingIROpcode(Instruction::SDiv);
1353 case Intrinsic::aarch64_sve_smax:
1354 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1355 case Intrinsic::aarch64_sve_smin:
1356 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1357 case Intrinsic::aarch64_sve_smulh:
1358 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1359 case Intrinsic::aarch64_sve_sub:
1360 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1361 .setMatchingIROpcode(Instruction::Sub);
1362 case Intrinsic::aarch64_sve_uabd:
1363 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1364 case Intrinsic::aarch64_sve_udiv:
1365 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1366 .setMatchingIROpcode(Instruction::UDiv);
1367 case Intrinsic::aarch64_sve_umax:
1368 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1369 case Intrinsic::aarch64_sve_umin:
1370 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1371 case Intrinsic::aarch64_sve_umulh:
1372 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1373 case Intrinsic::aarch64_sve_asr:
1374 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1375 .setMatchingIROpcode(Instruction::AShr);
1376 case Intrinsic::aarch64_sve_lsl:
1377 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1378 .setMatchingIROpcode(Instruction::Shl);
1379 case Intrinsic::aarch64_sve_lsr:
1380 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1381 .setMatchingIROpcode(Instruction::LShr);
1382 case Intrinsic::aarch64_sve_and:
1383 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1384 .setMatchingIROpcode(Instruction::And);
1385 case Intrinsic::aarch64_sve_bic:
1386 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1387 case Intrinsic::aarch64_sve_eor:
1388 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1389 .setMatchingIROpcode(Instruction::Xor);
1390 case Intrinsic::aarch64_sve_orr:
1391 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1392 .setMatchingIROpcode(Instruction::Or);
1393 case Intrinsic::aarch64_sve_sqsub:
1394 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1395 case Intrinsic::aarch64_sve_uqsub:
1396 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1397
1398 case Intrinsic::aarch64_sve_add_u:
1399 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1400 Instruction::Add);
1401 case Intrinsic::aarch64_sve_and_u:
1402 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1403 Instruction::And);
1404 case Intrinsic::aarch64_sve_asr_u:
1405 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1406 Instruction::AShr);
1407 case Intrinsic::aarch64_sve_eor_u:
1408 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1409 Instruction::Xor);
1410 case Intrinsic::aarch64_sve_fadd_u:
1411 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1412 Instruction::FAdd);
1413 case Intrinsic::aarch64_sve_fdiv_u:
1414 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1415 Instruction::FDiv);
1416 case Intrinsic::aarch64_sve_fmul_u:
1417 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1418 Instruction::FMul);
1419 case Intrinsic::aarch64_sve_fsub_u:
1420 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1421 Instruction::FSub);
1422 case Intrinsic::aarch64_sve_lsl_u:
1423 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1424 Instruction::Shl);
1425 case Intrinsic::aarch64_sve_lsr_u:
1426 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1427 Instruction::LShr);
1428 case Intrinsic::aarch64_sve_mul_u:
1429 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1430 Instruction::Mul);
1431 case Intrinsic::aarch64_sve_orr_u:
1432 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1433 Instruction::Or);
1434 case Intrinsic::aarch64_sve_sdiv_u:
1435 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1436 Instruction::SDiv);
1437 case Intrinsic::aarch64_sve_sub_u:
1438 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1439 Instruction::Sub);
1440 case Intrinsic::aarch64_sve_udiv_u:
1441 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1442 Instruction::UDiv);
1443
1444 case Intrinsic::aarch64_sve_addqv:
1445 case Intrinsic::aarch64_sve_and_z:
1446 case Intrinsic::aarch64_sve_bic_z:
1447 case Intrinsic::aarch64_sve_brka_z:
1448 case Intrinsic::aarch64_sve_brkb_z:
1449 case Intrinsic::aarch64_sve_brkn_z:
1450 case Intrinsic::aarch64_sve_brkpa_z:
1451 case Intrinsic::aarch64_sve_brkpb_z:
1452 case Intrinsic::aarch64_sve_cntp:
1453 case Intrinsic::aarch64_sve_compact:
1454 case Intrinsic::aarch64_sve_eor_z:
1455 case Intrinsic::aarch64_sve_eorv:
1456 case Intrinsic::aarch64_sve_eorqv:
1457 case Intrinsic::aarch64_sve_nand_z:
1458 case Intrinsic::aarch64_sve_nor_z:
1459 case Intrinsic::aarch64_sve_orn_z:
1460 case Intrinsic::aarch64_sve_orr_z:
1461 case Intrinsic::aarch64_sve_orv:
1462 case Intrinsic::aarch64_sve_orqv:
1463 case Intrinsic::aarch64_sve_pnext:
1464 case Intrinsic::aarch64_sve_rdffr_z:
1465 case Intrinsic::aarch64_sve_saddv:
1466 case Intrinsic::aarch64_sve_uaddv:
1467 case Intrinsic::aarch64_sve_umaxv:
1468 case Intrinsic::aarch64_sve_umaxqv:
1469 case Intrinsic::aarch64_sve_cmpeq:
1470 case Intrinsic::aarch64_sve_cmpeq_wide:
1471 case Intrinsic::aarch64_sve_cmpge:
1472 case Intrinsic::aarch64_sve_cmpge_wide:
1473 case Intrinsic::aarch64_sve_cmpgt:
1474 case Intrinsic::aarch64_sve_cmpgt_wide:
1475 case Intrinsic::aarch64_sve_cmphi:
1476 case Intrinsic::aarch64_sve_cmphi_wide:
1477 case Intrinsic::aarch64_sve_cmphs:
1478 case Intrinsic::aarch64_sve_cmphs_wide:
1479 case Intrinsic::aarch64_sve_cmple_wide:
1480 case Intrinsic::aarch64_sve_cmplo_wide:
1481 case Intrinsic::aarch64_sve_cmpls_wide:
1482 case Intrinsic::aarch64_sve_cmplt_wide:
1483 case Intrinsic::aarch64_sve_cmpne:
1484 case Intrinsic::aarch64_sve_cmpne_wide:
1485 case Intrinsic::aarch64_sve_facge:
1486 case Intrinsic::aarch64_sve_facgt:
1487 case Intrinsic::aarch64_sve_fcmpeq:
1488 case Intrinsic::aarch64_sve_fcmpge:
1489 case Intrinsic::aarch64_sve_fcmpgt:
1490 case Intrinsic::aarch64_sve_fcmpne:
1491 case Intrinsic::aarch64_sve_fcmpuo:
1492 case Intrinsic::aarch64_sve_ld1:
1493 case Intrinsic::aarch64_sve_ld1_gather:
1494 case Intrinsic::aarch64_sve_ld1_gather_index:
1495 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1496 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1497 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1498 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1499 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1500 case Intrinsic::aarch64_sve_ld1q_gather_index:
1501 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1502 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1503 case Intrinsic::aarch64_sve_ld1ro:
1504 case Intrinsic::aarch64_sve_ld1rq:
1505 case Intrinsic::aarch64_sve_ld1udq:
1506 case Intrinsic::aarch64_sve_ld1uwq:
1507 case Intrinsic::aarch64_sve_ld2_sret:
1508 case Intrinsic::aarch64_sve_ld2q_sret:
1509 case Intrinsic::aarch64_sve_ld3_sret:
1510 case Intrinsic::aarch64_sve_ld3q_sret:
1511 case Intrinsic::aarch64_sve_ld4_sret:
1512 case Intrinsic::aarch64_sve_ld4q_sret:
1513 case Intrinsic::aarch64_sve_ldff1:
1514 case Intrinsic::aarch64_sve_ldff1_gather:
1515 case Intrinsic::aarch64_sve_ldff1_gather_index:
1516 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1517 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1518 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1519 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1520 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1521 case Intrinsic::aarch64_sve_ldnf1:
1522 case Intrinsic::aarch64_sve_ldnt1:
1523 case Intrinsic::aarch64_sve_ldnt1_gather:
1524 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1525 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1526 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1527 return SVEIntrinsicInfo::defaultZeroingOp();
1528
1529 case Intrinsic::aarch64_sve_prf:
1530 case Intrinsic::aarch64_sve_prfb_gather_index:
1531 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1532 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1533 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1534 case Intrinsic::aarch64_sve_prfd_gather_index:
1535 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1536 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1537 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1538 case Intrinsic::aarch64_sve_prfh_gather_index:
1539 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1540 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1541 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1542 case Intrinsic::aarch64_sve_prfw_gather_index:
1543 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1544 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1545 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1546 return SVEIntrinsicInfo::defaultVoidOp(0);
1547
1548 case Intrinsic::aarch64_sve_st1_scatter:
1549 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1550 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1551 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1552 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1553 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1554 case Intrinsic::aarch64_sve_st1dq:
1555 case Intrinsic::aarch64_sve_st1q_scatter_index:
1556 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1557 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1558 case Intrinsic::aarch64_sve_st1wq:
1559 case Intrinsic::aarch64_sve_stnt1:
1560 case Intrinsic::aarch64_sve_stnt1_scatter:
1561 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1562 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1563 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1564 return SVEIntrinsicInfo::defaultVoidOp(1);
1565 case Intrinsic::aarch64_sve_st2:
1566 case Intrinsic::aarch64_sve_st2q:
1567 return SVEIntrinsicInfo::defaultVoidOp(2);
1568 case Intrinsic::aarch64_sve_st3:
1569 case Intrinsic::aarch64_sve_st3q:
1570 return SVEIntrinsicInfo::defaultVoidOp(3);
1571 case Intrinsic::aarch64_sve_st4:
1572 case Intrinsic::aarch64_sve_st4q:
1573 return SVEIntrinsicInfo::defaultVoidOp(4);
1574 }
1575
1576 return SVEIntrinsicInfo();
1577}
1578
1579static bool isAllActivePredicate(Value *Pred) {
1580 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1581 Value *UncastedPred;
1582 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1583 m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1584 m_Value(UncastedPred)))))
1585 // If the predicate has the same or fewer lanes than the uncasted
1586 // predicate then we know the casting has no effect.
1587 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1588 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1589 Pred = UncastedPred;
1590 auto *C = dyn_cast<Constant>(Pred);
1591 return (C && C->isAllOnesValue());
1592}
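// For example (illustrative): a predicate that is the constant
// splat (i1 true), possibly wrapped in a convert.to.svbool /
// convert.from.svbool round trip that does not drop lanes, is treated as
// all-active here, whereas a predicate produced by an sve.ptrue intrinsic is
// not a Constant and is not recognised by this check.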
1593
1594// Simplify `V` by only considering the operations that affect active lanes.
1595// This function should only return existing Values or newly created Constants.
1596static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1597 auto *Dup = dyn_cast<IntrinsicInst>(V);
1598 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1599 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1601 cast<VectorType>(V->getType())->getElementCount(),
1602 cast<Constant>(Dup->getOperand(2)));
1603
1604 return V;
1605}
1606
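// Simplify a predicated SVE binary intrinsic via its matching IR opcode. For
// example (illustrative): an integer sve.mul(pg, x, splat(0)) simplifies to
// zero for the active lanes; if the inactive lanes must be taken from x, the
// result becomes select(pg, zeroinitializer, x).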
1607static std::optional<Instruction *>
1609 const SVEIntrinsicInfo &IInfo) {
1610 const unsigned Opc = IInfo.getMatchingIROpode();
1611 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1612
1613 Value *Pg = II.getOperand(0);
1614 Value *Op1 = II.getOperand(1);
1615 Value *Op2 = II.getOperand(2);
1616 const DataLayout &DL = II.getDataLayout();
1617
1618 // Canonicalise constants to the RHS.
1620 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1621 IC.replaceOperand(II, 1, Op2);
1622 IC.replaceOperand(II, 2, Op1);
1623 return &II;
1624 }
1625
1626 // Only active lanes matter when simplifying the operation.
1627 Op1 = stripInactiveLanes(Op1, Pg);
1628 Op2 = stripInactiveLanes(Op2, Pg);
1629
1630 Value *SimpleII;
1631 if (auto FII = dyn_cast<FPMathOperator>(&II))
1632 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1633 else
1634 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1635
1636 // An SVE intrinsic's result is always defined. However, this is not the case
1637 // for its equivalent IR instruction (e.g. when shifting by an amount more
1638 // than the data's bitwidth). Simplifications to an undefined result must be
1639 // ignored to preserve the intrinsic's expected behaviour.
1640 if (!SimpleII || isa<UndefValue>(SimpleII))
1641 return std::nullopt;
1642
1643 if (IInfo.inactiveLanesAreNotDefined())
1644 return IC.replaceInstUsesWith(II, SimpleII);
1645
1646 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1647
1648 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1649 if (SimpleII == Inactive)
1650 return IC.replaceInstUsesWith(II, SimpleII);
1651
1652 // Inactive lanes must be preserved.
1653 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1654 return IC.replaceInstUsesWith(II, SimpleII);
1655}
1656
1657// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1658// to operations with less strict inactive lane requirements.
1659static std::optional<Instruction *>
1661 const SVEIntrinsicInfo &IInfo) {
1662 if (!IInfo.hasGoverningPredicate())
1663 return std::nullopt;
1664
1665 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1666
1667 // If there are no active lanes.
1668 if (match(OpPredicate, m_ZeroInt())) {
1670 return IC.replaceInstUsesWith(
1671 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1672
1673 if (IInfo.inactiveLanesAreUnused()) {
1674 if (IInfo.resultIsZeroInitialized())
1676
1677 return IC.eraseInstFromFunction(II);
1678 }
1679 }
1680
1681 // If there are no inactive lanes.
1682 if (isAllActivePredicate(OpPredicate)) {
1683 if (IInfo.hasOperandWithNoActiveLanes()) {
1684 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1685 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1686 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1687 }
1688
1689 if (IInfo.hasMatchingUndefIntrinsic()) {
1690 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1691 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1692 II.setCalledFunction(NewDecl);
1693 return &II;
1694 }
1695 }
1696
1697 // Operation specific simplifications.
1698 if (IInfo.hasMatchingIROpode() &&
1700 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1701
1702 return std::nullopt;
1703}
1704
1705 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1706// => (binop (pred) (from_svbool _) (from_svbool _))
1707//
1708// The above transformation eliminates a `to_svbool` in the predicate
1709// operand of bitwise operation `binop` by narrowing the vector width of
1710// the operation. For example, it would convert a `<vscale x 16 x i1>
1711// and` into a `<vscale x 4 x i1> and`. This is profitable because
1712// to_svbool must zero the new lanes during widening, whereas
1713// from_svbool is free.
1714static std::optional<Instruction *>
1716 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1717 if (!BinOp)
1718 return std::nullopt;
1719
1720 auto IntrinsicID = BinOp->getIntrinsicID();
1721 switch (IntrinsicID) {
1722 case Intrinsic::aarch64_sve_and_z:
1723 case Intrinsic::aarch64_sve_bic_z:
1724 case Intrinsic::aarch64_sve_eor_z:
1725 case Intrinsic::aarch64_sve_nand_z:
1726 case Intrinsic::aarch64_sve_nor_z:
1727 case Intrinsic::aarch64_sve_orn_z:
1728 case Intrinsic::aarch64_sve_orr_z:
1729 break;
1730 default:
1731 return std::nullopt;
1732 }
1733
1734 auto BinOpPred = BinOp->getOperand(0);
1735 auto BinOpOp1 = BinOp->getOperand(1);
1736 auto BinOpOp2 = BinOp->getOperand(2);
1737
1738 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1739 if (!PredIntr ||
1740 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1741 return std::nullopt;
1742
1743 auto PredOp = PredIntr->getOperand(0);
1744 auto PredOpTy = cast<VectorType>(PredOp->getType());
1745 if (PredOpTy != II.getType())
1746 return std::nullopt;
1747
1748 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1749 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1750 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1751 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1752 if (BinOpOp1 == BinOpOp2)
1753 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1754 else
1755 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1756 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1757
1758 auto NarrowedBinOp =
1759 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1760 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1761}
1762
1763static std::optional<Instruction *>
1765 // If the reinterpret instruction operand is a PHI node, try to simplify across the PHI.
1766 if (isa<PHINode>(II.getArgOperand(0)))
1767 return processPhiNode(IC, II);
1768
1769 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1770 return BinOpCombine;
1771
1772 // Ignore converts to/from svcount_t.
1773 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1774 isa<TargetExtType>(II.getType()))
1775 return std::nullopt;
1776
1777 SmallVector<Instruction *, 32> CandidatesForRemoval;
1778 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1779
1780 const auto *IVTy = cast<VectorType>(II.getType());
1781
1782 // Walk the chain of conversions.
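// For example (illustrative), with %p : <vscale x 4 x i1>:
//   from_svbool(to_svbool(from_svbool(to_svbool(%p)))) --> %p
// because no step in the chain has fewer lanes than the final result.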
1783 while (Cursor) {
1784 // If the type of the cursor has fewer lanes than the final result, zeroing
1785 // must take place, which breaks the equivalence chain.
1786 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1787 if (CursorVTy->getElementCount().getKnownMinValue() <
1788 IVTy->getElementCount().getKnownMinValue())
1789 break;
1790
1791 // If the cursor has the same type as II, it is a viable replacement.
1792 if (Cursor->getType() == IVTy)
1793 EarliestReplacement = Cursor;
1794
1795 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1796
1797 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1798 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1799 Intrinsic::aarch64_sve_convert_to_svbool ||
1800 IntrinsicCursor->getIntrinsicID() ==
1801 Intrinsic::aarch64_sve_convert_from_svbool))
1802 break;
1803
1804 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1805 Cursor = IntrinsicCursor->getOperand(0);
1806 }
1807
1808 // If no viable replacement in the conversion chain was found, there is
1809 // nothing to do.
1810 if (!EarliestReplacement)
1811 return std::nullopt;
1812
1813 return IC.replaceInstUsesWith(II, EarliestReplacement);
1814}
1815
1816static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1817 IntrinsicInst &II) {
1818 // svsel(ptrue, x, y) => x
1819 auto *OpPredicate = II.getOperand(0);
1820 if (isAllActivePredicate(OpPredicate))
1821 return IC.replaceInstUsesWith(II, II.getOperand(1));
1822
1823 auto Select =
1824 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1825 return IC.replaceInstUsesWith(II, Select);
1826}
1827
1828static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1829 IntrinsicInst &II) {
1830 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1831 if (!Pg)
1832 return std::nullopt;
1833
1834 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1835 return std::nullopt;
1836
1837 const auto PTruePattern =
1838 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1839 if (PTruePattern != AArch64SVEPredPattern::vl1)
1840 return std::nullopt;
1841
1842 // The intrinsic is inserting into lane zero so use an insert instead.
1843 auto *IdxTy = Type::getInt64Ty(II.getContext());
1844 auto *Insert = InsertElementInst::Create(
1845 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1846 Insert->insertBefore(II.getIterator());
1847 Insert->takeName(&II);
1848
1849 return IC.replaceInstUsesWith(II, Insert);
1850}
1851
1852static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1853 IntrinsicInst &II) {
1854 // Replace DupX with a regular IR splat.
1855 auto *RetTy = cast<ScalableVectorType>(II.getType());
1856 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1857 II.getArgOperand(0));
1858 Splat->takeName(&II);
1859 return IC.replaceInstUsesWith(II, Splat);
1860}
1861
1862static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1863 IntrinsicInst &II) {
1864 LLVMContext &Ctx = II.getContext();
1865
1866 if (!isAllActivePredicate(II.getArgOperand(0)))
1867 return std::nullopt;
1868
1869 // Check that we have a compare of zero..
1870 auto *SplatValue =
1872 if (!SplatValue || !SplatValue->isZero())
1873 return std::nullopt;
1874
1875 // ..against a dupq
1876 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1877 if (!DupQLane ||
1878 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1879 return std::nullopt;
1880
1881 // Where the dupq is a lane 0 replicate of a vector insert
1882 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1883 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1884 return std::nullopt;
1885
1886 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1887 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1888 return std::nullopt;
1889
1890 // Where the vector insert is a fixed constant vector insert into undef at
1891 // index zero
1892 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1893 return std::nullopt;
1894
1895 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1896 return std::nullopt;
1897
1898 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1899 if (!ConstVec)
1900 return std::nullopt;
1901
1902 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1903 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1904 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1905 return std::nullopt;
1906
1907 unsigned NumElts = VecTy->getNumElements();
1908 unsigned PredicateBits = 0;
1909
1910 // Expand the intrinsic operands to a 16-bit byte-level predicate mask.
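// For example (illustrative): a <4 x i32> constant <1, 0, 1, 0> covers 4 bytes
// per element, so bits 0 and 8 are set and PredicateBits == 0x101.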
1911 for (unsigned I = 0; I < NumElts; ++I) {
1912 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1913 if (!Arg)
1914 return std::nullopt;
1915 if (!Arg->isZero())
1916 PredicateBits |= 1 << (I * (16 / NumElts));
1917 }
1918
1919 // If all bits are zero, bail out early with an empty predicate.
1920 if (PredicateBits == 0) {
1921 auto *PFalse = Constant::getNullValue(II.getType());
1922 PFalse->takeName(&II);
1923 return IC.replaceInstUsesWith(II, PFalse);
1924 }
1925
1926 // Calculate largest predicate type used (where byte predicate is largest)
1927 unsigned Mask = 8;
1928 for (unsigned I = 0; I < 16; ++I)
1929 if ((PredicateBits & (1 << I)) != 0)
1930 Mask |= (I % 8);
1931
1932 unsigned PredSize = Mask & -Mask;
1933 auto *PredType = ScalableVectorType::get(
1934 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1935
1936 // Ensure all relevant bits are set
1937 for (unsigned I = 0; I < 16; I += PredSize)
1938 if ((PredicateBits & (1 << I)) == 0)
1939 return std::nullopt;
1940
1941 auto *PTruePat =
1942 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1943 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1944 {PredType}, {PTruePat});
1945 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1946 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1947 auto *ConvertFromSVBool =
1948 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1949 {II.getType()}, {ConvertToSVBool});
1950
1951 ConvertFromSVBool->takeName(&II);
1952 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1953}
1954
1955static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1956 IntrinsicInst &II) {
1957 Value *Pg = II.getArgOperand(0);
1958 Value *Vec = II.getArgOperand(1);
1959 auto IntrinsicID = II.getIntrinsicID();
1960 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1961
1962 // lastX(splat(X)) --> X
1963 if (auto *SplatVal = getSplatValue(Vec))
1964 return IC.replaceInstUsesWith(II, SplatVal);
1965
1966 // If x and/or y is a splat value then:
1967 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1968 Value *LHS, *RHS;
1969 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1970 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1971 auto *OldBinOp = cast<BinaryOperator>(Vec);
1972 auto OpC = OldBinOp->getOpcode();
1973 auto *NewLHS =
1974 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1975 auto *NewRHS =
1976 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1978 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1979 return IC.replaceInstUsesWith(II, NewBinOp);
1980 }
1981 }
1982
1983 auto *C = dyn_cast<Constant>(Pg);
1984 if (IsAfter && C && C->isNullValue()) {
1985 // The intrinsic is extracting lane 0 so use an extract instead.
1986 auto *IdxTy = Type::getInt64Ty(II.getContext());
1987 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1988 Extract->insertBefore(II.getIterator());
1989 Extract->takeName(&II);
1990 return IC.replaceInstUsesWith(II, Extract);
1991 }
1992
1993 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1994 if (!IntrPG)
1995 return std::nullopt;
1996
1997 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1998 return std::nullopt;
1999
2000 const auto PTruePattern =
2001 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2002
2003 // Can the intrinsic's predicate be converted to a known constant index?
2004 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2005 if (!MinNumElts)
2006 return std::nullopt;
2007
2008 unsigned Idx = MinNumElts - 1;
2009 // Increment the index if extracting the element after the last active
2010 // predicate element.
2011 if (IsAfter)
2012 ++Idx;
2013
2014 // Ignore extracts whose index is larger than the known minimum vector
2015 // length. NOTE: This is an artificial constraint where we prefer to
2016 // maintain what the user asked for until an alternative is proven faster.
2017 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2018 if (Idx >= PgVTy->getMinNumElements())
2019 return std::nullopt;
2020
2021 // The intrinsic is extracting a fixed lane so use an extract instead.
2022 auto *IdxTy = Type::getInt64Ty(II.getContext());
2023 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2024 Extract->insertBefore(II.getIterator());
2025 Extract->takeName(&II);
2026 return IC.replaceInstUsesWith(II, Extract);
2027}
2028
2029static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2030 IntrinsicInst &II) {
2031 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2032 // integer variant across a variety of micro-architectures. Replace scalar
2033 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2034 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2035 // depending on the micro-architecture, but has been observed as generally
2036 // being faster, particularly when the CLAST[AB] op is a loop-carried
2037 // dependency.
2038 Value *Pg = II.getArgOperand(0);
2039 Value *Fallback = II.getArgOperand(1);
2040 Value *Vec = II.getArgOperand(2);
2041 Type *Ty = II.getType();
2042
2043 if (!Ty->isIntegerTy())
2044 return std::nullopt;
2045
2046 Type *FPTy;
2047 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2048 default:
2049 return std::nullopt;
2050 case 16:
2051 FPTy = IC.Builder.getHalfTy();
2052 break;
2053 case 32:
2054 FPTy = IC.Builder.getFloatTy();
2055 break;
2056 case 64:
2057 FPTy = IC.Builder.getDoubleTy();
2058 break;
2059 }
2060
2061 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2062 auto *FPVTy = VectorType::get(
2063 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2064 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2065 auto *FPII = IC.Builder.CreateIntrinsic(
2066 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2067 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2068 return IC.replaceInstUsesWith(II, FPIItoInt);
2069}
2070
2071static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2072 IntrinsicInst &II) {
2073 LLVMContext &Ctx = II.getContext();
2074 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2075 // can work with RDFFR_PP for ptest elimination.
2076 auto *AllPat =
2077 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2078 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2079 {II.getType()}, {AllPat});
2080 auto *RDFFR =
2081 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2082 RDFFR->takeName(&II);
2083 return IC.replaceInstUsesWith(II, RDFFR);
2084}
2085
2086static std::optional<Instruction *>
2088 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2089
2090 if (Pattern == AArch64SVEPredPattern::all) {
2092 II.getType(), ElementCount::getScalable(NumElts));
2093 Cnt->takeName(&II);
2094 return IC.replaceInstUsesWith(II, Cnt);
2095 }
2096
2097 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2098
2099 return MinNumElts && NumElts >= MinNumElts
2100 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2101 II, ConstantInt::get(II.getType(), MinNumElts)))
2102 : std::nullopt;
2103}
2104
2105static std::optional<Instruction *>
2107 const AArch64Subtarget *ST) {
2108 if (!ST->isStreaming())
2109 return std::nullopt;
2110
2111 // In streaming mode, aarch64_sme_cnts is equivalent to aarch64_sve_cnt
2112 // with SVEPredPattern::all.
2114 II.getType(), ElementCount::getScalable(NumElts));
2115 Cnt->takeName(&II);
2116 return IC.replaceInstUsesWith(II, Cnt);
2117}
2118
2119static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2120 IntrinsicInst &II) {
2121 Value *PgVal = II.getArgOperand(0);
2122 Value *OpVal = II.getArgOperand(1);
2123
2124 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2125 // Later optimizations prefer this form.
2126 if (PgVal == OpVal &&
2127 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2128 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2129 Value *Ops[] = {PgVal, OpVal};
2130 Type *Tys[] = {PgVal->getType()};
2131
2132 auto *PTest =
2133 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2134 PTest->takeName(&II);
2135
2136 return IC.replaceInstUsesWith(II, PTest);
2137 }
2138
2141
2142 if (!Pg || !Op)
2143 return std::nullopt;
2144
2145 Intrinsic::ID OpIID = Op->getIntrinsicID();
2146
2147 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2148 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2149 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2150 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2151 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2152
2153 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2154
2155 PTest->takeName(&II);
2156 return IC.replaceInstUsesWith(II, PTest);
2157 }
2158
2159 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2160 // Later optimizations may rewrite the sequence to use the flag-setting variant
2161 // of instruction X to remove the PTEST.
2162 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2163 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2164 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2165 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2166 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2167 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2168 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2169 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2170 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2171 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2172 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2173 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2174 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2175 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2176 Type *Tys[] = {Pg->getType()};
2177
2178 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2179 PTest->takeName(&II);
2180
2181 return IC.replaceInstUsesWith(II, PTest);
2182 }
2183
2184 return std::nullopt;
2185}
2186
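// Fuse a predicated multiply feeding a predicated add/sub into a single
// multiply-accumulate intrinsic. For example (illustrative):
//   fadd(pg, a, fmul(pg, b, c)) --> fmla(pg, a, b, c)
// provided the multiply has a single use and, for floating-point types, both
// intrinsics carry matching fast-math flags that allow contraction.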
2187template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
2188static std::optional<Instruction *>
2190 bool MergeIntoAddendOp) {
2191 Value *P = II.getOperand(0);
2192 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2193 if (MergeIntoAddendOp) {
2194 AddendOp = II.getOperand(1);
2195 Mul = II.getOperand(2);
2196 } else {
2197 AddendOp = II.getOperand(2);
2198 Mul = II.getOperand(1);
2199 }
2200
2202 m_Value(MulOp1))))
2203 return std::nullopt;
2204
2205 if (!Mul->hasOneUse())
2206 return std::nullopt;
2207
2208 Instruction *FMFSource = nullptr;
2209 if (II.getType()->isFPOrFPVectorTy()) {
2210 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2211 // Stop the combine when the flags on the inputs differ, in case dropping
2212 // flags would cause us to miss more beneficial optimizations.
2213 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2214 return std::nullopt;
2215 if (!FAddFlags.allowContract())
2216 return std::nullopt;
2217 FMFSource = &II;
2218 }
2219
2220 CallInst *Res;
2221 if (MergeIntoAddendOp)
2222 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2223 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2224 else
2225 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2226 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2227
2228 return IC.replaceInstUsesWith(II, Res);
2229}
2230
2231static std::optional<Instruction *>
2233 Value *Pred = II.getOperand(0);
2234 Value *PtrOp = II.getOperand(1);
2235 Type *VecTy = II.getType();
2236
2237 if (isAllActivePredicate(Pred)) {
2238 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2239 Load->copyMetadata(II);
2240 return IC.replaceInstUsesWith(II, Load);
2241 }
2242
2243 CallInst *MaskedLoad =
2244 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2245 Pred, ConstantAggregateZero::get(VecTy));
2246 MaskedLoad->copyMetadata(II);
2247 return IC.replaceInstUsesWith(II, MaskedLoad);
2248}
2249
2250static std::optional<Instruction *>
2252 Value *VecOp = II.getOperand(0);
2253 Value *Pred = II.getOperand(1);
2254 Value *PtrOp = II.getOperand(2);
2255
2256 if (isAllActivePredicate(Pred)) {
2257 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2258 Store->copyMetadata(II);
2259 return IC.eraseInstFromFunction(II);
2260 }
2261
2262 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2263 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2264 MaskedStore->copyMetadata(II);
2265 return IC.eraseInstFromFunction(II);
2266}
2267
2269 switch (Intrinsic) {
2270 case Intrinsic::aarch64_sve_fmul_u:
2271 return Instruction::BinaryOps::FMul;
2272 case Intrinsic::aarch64_sve_fadd_u:
2273 return Instruction::BinaryOps::FAdd;
2274 case Intrinsic::aarch64_sve_fsub_u:
2275 return Instruction::BinaryOps::FSub;
2276 default:
2277 return Instruction::BinaryOpsEnd;
2278 }
2279}
2280
2281static std::optional<Instruction *>
2283 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2284 if (II.isStrictFP())
2285 return std::nullopt;
2286
2287 auto *OpPredicate = II.getOperand(0);
2288 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2289 if (BinOpCode == Instruction::BinaryOpsEnd ||
2290 !isAllActivePredicate(OpPredicate))
2291 return std::nullopt;
2292 auto BinOp = IC.Builder.CreateBinOpFMF(
2293 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2294 return IC.replaceInstUsesWith(II, BinOp);
2295}
2296
2297static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2298 IntrinsicInst &II) {
2299 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2300 Intrinsic::aarch64_sve_mla>(
2301 IC, II, true))
2302 return MLA;
2303 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2304 Intrinsic::aarch64_sve_mad>(
2305 IC, II, false))
2306 return MAD;
2307 return std::nullopt;
2308}
2309
2310static std::optional<Instruction *>
2312 if (auto FMLA =
2313 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2314 Intrinsic::aarch64_sve_fmla>(IC, II,
2315 true))
2316 return FMLA;
2317 if (auto FMAD =
2318 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2319 Intrinsic::aarch64_sve_fmad>(IC, II,
2320 false))
2321 return FMAD;
2322 if (auto FMLA =
2323 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2324 Intrinsic::aarch64_sve_fmla>(IC, II,
2325 true))
2326 return FMLA;
2327 return std::nullopt;
2328}
2329
2330static std::optional<Instruction *>
2332 if (auto FMLA =
2333 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2334 Intrinsic::aarch64_sve_fmla>(IC, II,
2335 true))
2336 return FMLA;
2337 if (auto FMAD =
2338 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2339 Intrinsic::aarch64_sve_fmad>(IC, II,
2340 false))
2341 return FMAD;
2342 if (auto FMLA_U =
2343 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2344 Intrinsic::aarch64_sve_fmla_u>(
2345 IC, II, true))
2346 return FMLA_U;
2347 return instCombineSVEVectorBinOp(IC, II);
2348}
2349
2350static std::optional<Instruction *>
2352 if (auto FMLS =
2353 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2354 Intrinsic::aarch64_sve_fmls>(IC, II,
2355 true))
2356 return FMLS;
2357 if (auto FMSB =
2358 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2359 Intrinsic::aarch64_sve_fnmsb>(
2360 IC, II, false))
2361 return FMSB;
2362 if (auto FMLS =
2363 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2364 Intrinsic::aarch64_sve_fmls>(IC, II,
2365 true))
2366 return FMLS;
2367 return std::nullopt;
2368}
2369
2370static std::optional<Instruction *>
2372 if (auto FMLS =
2373 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2374 Intrinsic::aarch64_sve_fmls>(IC, II,
2375 true))
2376 return FMLS;
2377 if (auto FMSB =
2378 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2379 Intrinsic::aarch64_sve_fnmsb>(
2380 IC, II, false))
2381 return FMSB;
2382 if (auto FMLS_U =
2383 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2384 Intrinsic::aarch64_sve_fmls_u>(
2385 IC, II, true))
2386 return FMLS_U;
2387 return instCombineSVEVectorBinOp(IC, II);
2388}
2389
2390static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2391 IntrinsicInst &II) {
2392 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2393 Intrinsic::aarch64_sve_mls>(
2394 IC, II, true))
2395 return MLS;
2396 return std::nullopt;
2397}
2398
2399static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2400 IntrinsicInst &II) {
2401 Value *UnpackArg = II.getArgOperand(0);
2402 auto *RetTy = cast<ScalableVectorType>(II.getType());
2403 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2404 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2405
2406 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2407 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2408 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2409 ScalarArg =
2410 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2411 Value *NewVal =
2412 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2413 NewVal->takeName(&II);
2414 return IC.replaceInstUsesWith(II, NewVal);
2415 }
2416
2417 return std::nullopt;
2418}
2419static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2420 IntrinsicInst &II) {
2421 auto *OpVal = II.getOperand(0);
2422 auto *OpIndices = II.getOperand(1);
2423 VectorType *VTy = cast<VectorType>(II.getType());
2424
2425 // Check whether OpIndices is a constant splat value smaller than the minimum
2426 // element count of the result.
2427 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2428 if (!SplatValue ||
2429 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2430 return std::nullopt;
2431
2432 // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
2433 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2434 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2435 auto *VectorSplat =
2436 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2437
2438 VectorSplat->takeName(&II);
2439 return IC.replaceInstUsesWith(II, VectorSplat);
2440}
2441
2442static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2443 IntrinsicInst &II) {
2444 Value *A, *B;
2445 Type *RetTy = II.getType();
2446 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2447 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2448
2449 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2450 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2451 if ((match(II.getArgOperand(0),
2453 match(II.getArgOperand(1),
2455 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2456 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2457 auto *TyA = cast<ScalableVectorType>(A->getType());
2458 if (TyA == B->getType() &&
2460 auto *SubVec = IC.Builder.CreateInsertVector(
2461 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2462 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2463 TyA->getMinNumElements());
2464 ConcatVec->takeName(&II);
2465 return IC.replaceInstUsesWith(II, ConcatVec);
2466 }
2467 }
2468
2469 return std::nullopt;
2470}
2471
2472static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2473 IntrinsicInst &II) {
2474 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2475 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2476 Value *A, *B;
2477 if (match(II.getArgOperand(0),
2480 m_Specific(A), m_Specific(B))))
2481 return IC.replaceInstUsesWith(
2482 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2483
2484 return std::nullopt;
2485}
2486
2487static std::optional<Instruction *>
2489 Value *Mask = II.getOperand(0);
2490 Value *BasePtr = II.getOperand(1);
2491 Value *Index = II.getOperand(2);
2492 Type *Ty = II.getType();
2493 Value *PassThru = ConstantAggregateZero::get(Ty);
2494
2495 // Contiguous gather => masked load.
2496 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2497 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2498 Value *IndexBase;
2500 m_Value(IndexBase), m_SpecificInt(1)))) {
2501 Align Alignment =
2502 BasePtr->getPointerAlignment(II.getDataLayout());
2503
2504 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2505 BasePtr, IndexBase);
2506 CallInst *MaskedLoad =
2507 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2508 MaskedLoad->takeName(&II);
2509 return IC.replaceInstUsesWith(II, MaskedLoad);
2510 }
2511
2512 return std::nullopt;
2513}
2514
2515static std::optional<Instruction *>
2517 Value *Val = II.getOperand(0);
2518 Value *Mask = II.getOperand(1);
2519 Value *BasePtr = II.getOperand(2);
2520 Value *Index = II.getOperand(3);
2521 Type *Ty = Val->getType();
2522
2523 // Contiguous scatter => masked store.
2524 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2525 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2526 Value *IndexBase;
2528 m_Value(IndexBase), m_SpecificInt(1)))) {
2529 Align Alignment =
2530 BasePtr->getPointerAlignment(II.getDataLayout());
2531
2532 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2533 BasePtr, IndexBase);
2534 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2535
2536 return IC.eraseInstFromFunction(II);
2537 }
2538
2539 return std::nullopt;
2540}
2541
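// Fold an SVE signed divide by a (negated) power-of-two splat into an
// arithmetic shift, e.g. (illustrative):
//   sdiv(pg, x, splat(8))  --> asrd(pg, x, 3)
//   sdiv(pg, x, splat(-8)) --> neg(asrd(pg, x, 3))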
2542static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2543 IntrinsicInst &II) {
2545 Value *Pred = II.getOperand(0);
2546 Value *Vec = II.getOperand(1);
2547 Value *DivVec = II.getOperand(2);
2548
2549 Value *SplatValue = getSplatValue(DivVec);
2550 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2551 if (!SplatConstantInt)
2552 return std::nullopt;
2553
2554 APInt Divisor = SplatConstantInt->getValue();
2555 const int64_t DivisorValue = Divisor.getSExtValue();
2556 if (DivisorValue == -1)
2557 return std::nullopt;
2558 if (DivisorValue == 1)
2559 IC.replaceInstUsesWith(II, Vec);
2560
2561 if (Divisor.isPowerOf2()) {
2562 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2563 auto ASRD = IC.Builder.CreateIntrinsic(
2564 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2565 return IC.replaceInstUsesWith(II, ASRD);
2566 }
2567 if (Divisor.isNegatedPowerOf2()) {
2568 Divisor.negate();
2569 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2570 auto ASRD = IC.Builder.CreateIntrinsic(
2571 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2572 auto NEG = IC.Builder.CreateIntrinsic(
2573 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2574 return IC.replaceInstUsesWith(II, NEG);
2575 }
2576
2577 return std::nullopt;
2578}
2579
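// Returns true when Vec is a repeated power-of-two pattern, shrinking Vec to
// that repeating prefix as a side effect, e.g. (illustrative) (A, B, A, B)
// becomes (A, B). Null entries stand for poison lanes and are allowed to match
// anything when AllowPoison is set.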
2580bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2581 size_t VecSize = Vec.size();
2582 if (VecSize == 1)
2583 return true;
2584 if (!isPowerOf2_64(VecSize))
2585 return false;
2586 size_t HalfVecSize = VecSize / 2;
2587
2588 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2589 RHS != Vec.end(); LHS++, RHS++) {
2590 if (*LHS != nullptr && *RHS != nullptr) {
2591 if (*LHS == *RHS)
2592 continue;
2593 else
2594 return false;
2595 }
2596 if (!AllowPoison)
2597 return false;
2598 if (*LHS == nullptr && *RHS != nullptr)
2599 *LHS = *RHS;
2600 }
2601
2602 Vec.resize(HalfVecSize);
2603 SimplifyValuePattern(Vec, AllowPoison);
2604 return true;
2605}
2606
2607// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2608// to dupqlane(f64(C)) where C is A concatenated with B
2609static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2610 IntrinsicInst &II) {
2611 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2612 if (!match(II.getOperand(0),
2614 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2615 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2616 return std::nullopt;
2617 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2618
2619 // Insert the scalars into a container ordered by InsertElement index
2620 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2621 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2622 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2623 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2624 CurrentInsertElt = InsertElt->getOperand(0);
2625 }
2626
2627 bool AllowPoison =
2628 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2629 if (!SimplifyValuePattern(Elts, AllowPoison))
2630 return std::nullopt;
2631
2632 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2633 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2634 for (size_t I = 0; I < Elts.size(); I++) {
2635 if (Elts[I] == nullptr)
2636 continue;
2637 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2638 IC.Builder.getInt64(I));
2639 }
2640 if (InsertEltChain == nullptr)
2641 return std::nullopt;
2642
2643 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2644 // value or (f16 a, f16 b) as one i32 value. This requires the InsertSubvector
2645 // to be bitcast to a type wide enough to fit the sequence, splatted, and then
2646 // narrowed back to the original type.
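// For example (illustrative): a <vscale x 8 x half> dupqlane whose elements
// simplified to (a, b) has PatternWidth == 32, so the pair is splatted as a
// <vscale x 4 x i32> and bitcast back to <vscale x 8 x half>.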
2647 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2648 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2649 IIScalableTy->getMinNumElements() /
2650 PatternWidth;
2651
2652 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2653 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2654 auto *WideShuffleMaskTy =
2655 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2656
2657 auto InsertSubvector = IC.Builder.CreateInsertVector(
2658 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2659 uint64_t(0));
2660 auto WideBitcast =
2661 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2662 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2663 auto WideShuffle = IC.Builder.CreateShuffleVector(
2664 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2665 auto NarrowBitcast =
2666 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2667
2668 return IC.replaceInstUsesWith(II, NarrowBitcast);
2669}
2670
2671static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2672 IntrinsicInst &II) {
2673 Value *A = II.getArgOperand(0);
2674 Value *B = II.getArgOperand(1);
2675 if (A == B)
2676 return IC.replaceInstUsesWith(II, A);
2677
2678 return std::nullopt;
2679}
2680
2681static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2682 IntrinsicInst &II) {
2683 Value *Pred = II.getOperand(0);
2684 Value *Vec = II.getOperand(1);
2685 Value *Shift = II.getOperand(2);
2686
2687 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
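// For example (illustrative):
//   srshl(pg, sve.abs(undef, pg, x), splat(2))
//     --> lsl(pg, sve.abs(undef, pg, x), splat(2))
// given that with a non-negative shift amount SRSHL's rounding never applies.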
2688 Value *AbsPred, *MergedValue;
2690 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2692 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2693
2694 return std::nullopt;
2695
2696 // The transform is valid if any of the following are true:
2697 // * The ABS merge value is an undef or non-negative
2698 // * The ABS predicate is all active
2699 // * The ABS predicate and the SRSHL predicates are the same
2700 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2701 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2702 return std::nullopt;
2703
2704 // Only valid when the shift amount is non-negative, otherwise the rounding
2705 // behaviour of SRSHL cannot be ignored.
2706 if (!match(Shift, m_NonNegative()))
2707 return std::nullopt;
2708
2709 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2710 {II.getType()}, {Pred, Vec, Shift});
2711
2712 return IC.replaceInstUsesWith(II, LSL);
2713}
2714
2715static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2716 IntrinsicInst &II) {
2717 Value *Vec = II.getOperand(0);
2718
2719 if (getSplatValue(Vec) == II.getOperand(1))
2720 return IC.replaceInstUsesWith(II, Vec);
2721
2722 return std::nullopt;
2723}
2724
2725static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2726 IntrinsicInst &II) {
2727 // If this barrier is post-dominated by an identical one, we can remove it.
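// For example (illustrative): two identical dmb calls separated only by
// instructions that neither touch memory nor have other side effects can be
// collapsed into a single barrier.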
2728 auto *NI = II.getNextNode();
2729 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2730 auto CanSkipOver = [](Instruction *I) {
2731 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2732 };
2733 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2734 auto *NIBB = NI->getParent();
2735 NI = NI->getNextNode();
2736 if (!NI) {
2737 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2738 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2739 else
2740 break;
2741 }
2742 }
2743 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2744 if (NextII && II.isIdenticalTo(NextII))
2745 return IC.eraseInstFromFunction(II);
2746
2747 return std::nullopt;
2748}
2749
2750static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2751 IntrinsicInst &II) {
2753 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2754 return std::nullopt;
2755}
2756
2757static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2759 unsigned NumBits) {
2760 Value *Passthru = II.getOperand(0);
2761 Value *Pg = II.getOperand(1);
2762 Value *Op = II.getOperand(2);
2763
2764 // Convert UXT[BHW] to AND.
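// For example (illustrative): uxtb(passthru, pg, x) on <vscale x 8 x i16>
// becomes and_u(pg, x, splat(i16 0xff)) when passthru is undef or pg is all
// active.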
2765 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2766 auto *Ty = cast<VectorType>(II.getType());
2767 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2768 auto *Mask = ConstantInt::get(Ty, MaskValue);
2769 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2770 {Pg, Op, Mask});
2771 return IC.replaceInstUsesWith(II, And);
2772 }
2773
2774 return std::nullopt;
2775}
2776
2777static std::optional<Instruction *>
2779 SMEAttrs FnSMEAttrs(*II.getFunction());
2780 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2781 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2782 return IC.replaceInstUsesWith(
2783 II, ConstantInt::getBool(II.getType(), IsStreaming));
2784 return std::nullopt;
2785}
2786
2787std::optional<Instruction *>
2789 IntrinsicInst &II) const {
2791 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2792 return I;
2793
2794 Intrinsic::ID IID = II.getIntrinsicID();
2795 switch (IID) {
2796 default:
2797 break;
2798 case Intrinsic::aarch64_dmb:
2799 return instCombineDMB(IC, II);
2800 case Intrinsic::aarch64_neon_fmaxnm:
2801 case Intrinsic::aarch64_neon_fminnm:
2802 return instCombineMaxMinNM(IC, II);
2803 case Intrinsic::aarch64_sve_convert_from_svbool:
2804 return instCombineConvertFromSVBool(IC, II);
2805 case Intrinsic::aarch64_sve_dup:
2806 return instCombineSVEDup(IC, II);
2807 case Intrinsic::aarch64_sve_dup_x:
2808 return instCombineSVEDupX(IC, II);
2809 case Intrinsic::aarch64_sve_cmpne:
2810 case Intrinsic::aarch64_sve_cmpne_wide:
2811 return instCombineSVECmpNE(IC, II);
2812 case Intrinsic::aarch64_sve_rdffr:
2813 return instCombineRDFFR(IC, II);
2814 case Intrinsic::aarch64_sve_lasta:
2815 case Intrinsic::aarch64_sve_lastb:
2816 return instCombineSVELast(IC, II);
2817 case Intrinsic::aarch64_sve_clasta_n:
2818 case Intrinsic::aarch64_sve_clastb_n:
2819 return instCombineSVECondLast(IC, II);
2820 case Intrinsic::aarch64_sve_cntd:
2821 return instCombineSVECntElts(IC, II, 2);
2822 case Intrinsic::aarch64_sve_cntw:
2823 return instCombineSVECntElts(IC, II, 4);
2824 case Intrinsic::aarch64_sve_cnth:
2825 return instCombineSVECntElts(IC, II, 8);
2826 case Intrinsic::aarch64_sve_cntb:
2827 return instCombineSVECntElts(IC, II, 16);
2828 case Intrinsic::aarch64_sme_cntsd:
2829 return instCombineSMECntsElts(IC, II, 2, ST);
2830 case Intrinsic::aarch64_sme_cntsw:
2831 return instCombineSMECntsElts(IC, II, 4, ST);
2832 case Intrinsic::aarch64_sme_cntsh:
2833 return instCombineSMECntsElts(IC, II, 8, ST);
2834 case Intrinsic::aarch64_sme_cntsb:
2835 return instCombineSMECntsElts(IC, II, 16, ST);
2836 case Intrinsic::aarch64_sve_ptest_any:
2837 case Intrinsic::aarch64_sve_ptest_first:
2838 case Intrinsic::aarch64_sve_ptest_last:
2839 return instCombineSVEPTest(IC, II);
2840 case Intrinsic::aarch64_sve_fadd:
2841 return instCombineSVEVectorFAdd(IC, II);
2842 case Intrinsic::aarch64_sve_fadd_u:
2843 return instCombineSVEVectorFAddU(IC, II);
2844 case Intrinsic::aarch64_sve_fmul_u:
2845 return instCombineSVEVectorBinOp(IC, II);
2846 case Intrinsic::aarch64_sve_fsub:
2847 return instCombineSVEVectorFSub(IC, II);
2848 case Intrinsic::aarch64_sve_fsub_u:
2849 return instCombineSVEVectorFSubU(IC, II);
2850 case Intrinsic::aarch64_sve_add:
2851 return instCombineSVEVectorAdd(IC, II);
2852 case Intrinsic::aarch64_sve_add_u:
2853 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2854 Intrinsic::aarch64_sve_mla_u>(
2855 IC, II, true);
2856 case Intrinsic::aarch64_sve_sub:
2857 return instCombineSVEVectorSub(IC, II);
2858 case Intrinsic::aarch64_sve_sub_u:
2859 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2860 Intrinsic::aarch64_sve_mls_u>(
2861 IC, II, true);
2862 case Intrinsic::aarch64_sve_tbl:
2863 return instCombineSVETBL(IC, II);
2864 case Intrinsic::aarch64_sve_uunpkhi:
2865 case Intrinsic::aarch64_sve_uunpklo:
2866 case Intrinsic::aarch64_sve_sunpkhi:
2867 case Intrinsic::aarch64_sve_sunpklo:
2868 return instCombineSVEUnpack(IC, II);
2869 case Intrinsic::aarch64_sve_uzp1:
2870 return instCombineSVEUzp1(IC, II);
2871 case Intrinsic::aarch64_sve_zip1:
2872 case Intrinsic::aarch64_sve_zip2:
2873 return instCombineSVEZip(IC, II);
2874 case Intrinsic::aarch64_sve_ld1_gather_index:
2875 return instCombineLD1GatherIndex(IC, II);
2876 case Intrinsic::aarch64_sve_st1_scatter_index:
2877 return instCombineST1ScatterIndex(IC, II);
2878 case Intrinsic::aarch64_sve_ld1:
2879 return instCombineSVELD1(IC, II, DL);
2880 case Intrinsic::aarch64_sve_st1:
2881 return instCombineSVEST1(IC, II, DL);
2882 case Intrinsic::aarch64_sve_sdiv:
2883 return instCombineSVESDIV(IC, II);
2884 case Intrinsic::aarch64_sve_sel:
2885 return instCombineSVESel(IC, II);
2886 case Intrinsic::aarch64_sve_srshl:
2887 return instCombineSVESrshl(IC, II);
2888 case Intrinsic::aarch64_sve_dupq_lane:
2889 return instCombineSVEDupqLane(IC, II);
2890 case Intrinsic::aarch64_sve_insr:
2891 return instCombineSVEInsr(IC, II);
2892 case Intrinsic::aarch64_sve_ptrue:
2893 return instCombinePTrue(IC, II);
2894 case Intrinsic::aarch64_sve_uxtb:
2895 return instCombineSVEUxt(IC, II, 8);
2896 case Intrinsic::aarch64_sve_uxth:
2897 return instCombineSVEUxt(IC, II, 16);
2898 case Intrinsic::aarch64_sve_uxtw:
2899 return instCombineSVEUxt(IC, II, 32);
2900 case Intrinsic::aarch64_sme_in_streaming_mode:
2901 return instCombineInStreamingMode(IC, II);
2902 }
2903
2904 return std::nullopt;
2905}
2906
2908 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2909 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2910 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2911 SimplifyAndSetOp) const {
2912 switch (II.getIntrinsicID()) {
2913 default:
2914 break;
2915 case Intrinsic::aarch64_neon_fcvtxn:
2916 case Intrinsic::aarch64_neon_rshrn:
2917 case Intrinsic::aarch64_neon_sqrshrn:
2918 case Intrinsic::aarch64_neon_sqrshrun:
2919 case Intrinsic::aarch64_neon_sqshrn:
2920 case Intrinsic::aarch64_neon_sqshrun:
2921 case Intrinsic::aarch64_neon_sqxtn:
2922 case Intrinsic::aarch64_neon_sqxtun:
2923 case Intrinsic::aarch64_neon_uqrshrn:
2924 case Intrinsic::aarch64_neon_uqshrn:
2925 case Intrinsic::aarch64_neon_uqxtn:
2926 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2927 break;
2928 }
2929
2930 return std::nullopt;
2931}
2932
2934 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2936}
2937
2940 switch (K) {
2942 return TypeSize::getFixed(64);
2944 if (ST->useSVEForFixedLengthVectors() &&
2945 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2946 return TypeSize::getFixed(
2947 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2948 else if (ST->isNeonAvailable())
2949 return TypeSize::getFixed(128);
2950 else
2951 return TypeSize::getFixed(0);
2953 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2955 return TypeSize::getScalable(128);
2956 else
2957 return TypeSize::getScalable(0);
2958 }
2959 llvm_unreachable("Unsupported register kind");
2960}
2961
2962bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2964 Type *SrcOverrideTy) const {
2965 // A helper that returns a vector type whose scalar type comes from the given
2966 // type and whose element count is taken from DstTy.
2967 auto toVectorTy = [&](Type *ArgTy) {
2968 return VectorType::get(ArgTy->getScalarType(),
2969 cast<VectorType>(DstTy)->getElementCount());
2970 };
2971
2972 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2973 // i32, i64]. SVE doesn't generally have the same set of instructions to
2974 // perform an extend with the add/sub/mul. There are SMULLB style
2975 // instructions, but they operate on top/bottom, requiring some sort of lane
2976 // interleaving to be used with zext/sext.
2977 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2978 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2979 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2980 return false;
2981
2982 // Determine if the operation has a widening variant. We consider both the
2983 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2984 // instructions.
2985 //
2986 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2987 // verify that their extending operands are eliminated during code
2988 // generation.
2989 Type *SrcTy = SrcOverrideTy;
2990 switch (Opcode) {
2991 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2992 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2993 // The second operand needs to be an extend
2994 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2995 if (!SrcTy)
2996 SrcTy =
2997 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2998 } else
2999 return false;
3000 break;
3001 case Instruction::Mul: { // SMULL(2), UMULL(2)
3002 // Both operands need to be extends of the same type.
3003 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3004 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3005 if (!SrcTy)
3006 SrcTy =
3007 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3008 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
3009 // If one of the operands is a Zext and the other has enough zero bits to
3010 // be treated as unsigned, we can still generate a umull, meaning the zext
3011 // is free.
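// For example (illustrative): for a v8i16 mul where one operand is
// zext(v8i8) and the other is known to have its top 8 bits clear, both
// inputs fit in 8 bits and the multiply can be lowered as umull, so the
// zext costs nothing.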
3012 KnownBits Known =
3013 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3014 if (Args[0]->getType()->getScalarSizeInBits() -
3015 Known.Zero.countLeadingOnes() >
3016 DstTy->getScalarSizeInBits() / 2)
3017 return false;
3018 if (!SrcTy)
3019 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
3020 DstTy->getScalarSizeInBits() / 2));
3021 } else
3022 return false;
3023 break;
3024 }
3025 default:
3026 return false;
3027 }
3028
3029 // Legalize the destination type and ensure it can be used in a widening
3030 // operation.
3031 auto DstTyL = getTypeLegalizationCost(DstTy);
3032 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3033 return false;
3034
3035 // Legalize the source type and ensure it can be used in a widening
3036 // operation.
3037 assert(SrcTy && "Expected some SrcTy");
3038 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3039 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3040 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3041 return false;
3042
3043 // Get the total number of vector elements in the legalized types.
3044 InstructionCost NumDstEls =
3045 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3046 InstructionCost NumSrcEls =
3047 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3048
3049 // Return true if the legalized types have the same number of vector elements
3050 // and the destination element type size is twice that of the source type.
3051 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3052}
3053
3054// s/urhadd instructions implement the following pattern, making the
3055// extends free:
3056// %x = add ((zext i8 -> i16), 1)
3057// %y = (zext i8 -> i16)
3058// trunc i16 (lshr (add %x, %y), 1) -> i8
3059//
3061 Type *Src) const {
3062 // The source should be a legal vector type.
3063 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3064 (Src->isScalableTy() && !ST->hasSVE2()))
3065 return false;
3066
3067 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3068 return false;
3069
3070 // Look for trunc/lshr/add before trying to match the pattern.
3071 const Instruction *Add = ExtUser;
3072 auto *AddUser =
3073 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3074 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3075 Add = AddUser;
3076
3077 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3078 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3079 return false;
3080
3081 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3082 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3083 Src->getScalarSizeInBits() !=
3084 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3085 return false;
3086
3087 // Try to match the whole pattern. Ext could be either the first or second
3088 // m_ZExtOrSExt matched.
3089 Instruction *Ex1, *Ex2;
3090 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3091 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3092 return false;
3093
3094 // Ensure both extends are of the same type
3095 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3096 Ex1->getOpcode() == Ex2->getOpcode())
3097 return true;
3098
3099 return false;
3100}
3101
3103 Type *Src,
3106 const Instruction *I) const {
3107 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3108 assert(ISD && "Invalid opcode");
3109 // If the cast is observable, and it is used by a widening instruction (e.g.,
3110 // uaddl, saddw, etc.), it may be free.
3111 if (I && I->hasOneUser()) {
3112 auto *SingleUser = cast<Instruction>(*I->user_begin());
3113 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3114 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
3115 // For adds, the second operand is always counted as free; the first operand
3116 // is only free when the second operand is the same kind of extend (i.e. in
3117 // add(sext, zext) only one of the two extends is free).
3118 if (SingleUser->getOpcode() == Instruction::Add) {
3119 if (I == SingleUser->getOperand(1) ||
3120 (isa<CastInst>(SingleUser->getOperand(1)) &&
3121 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3122 return 0;
3123 } else // Others are free so long as isWideningInstruction returned true.
3124 return 0;
3125 }
3126
3127 // The cast will be free for the s/urhadd instructions
3128 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3129 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3130 return 0;
3131 }
3132
3133 // TODO: Allow non-throughput costs that aren't binary.
3134 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3136 return Cost == 0 ? 0 : 1;
3137 return Cost;
3138 };
3139
3140 EVT SrcTy = TLI->getValueType(DL, Src);
3141 EVT DstTy = TLI->getValueType(DL, Dst);
3142
3143 if (!SrcTy.isSimple() || !DstTy.isSimple())
3144 return AdjustCost(
3145 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3146
3147 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3148 // we use fcvtx under SVE2. Give them invalid costs.
3149 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3150 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3151 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3153
3154 static const TypeConversionCostTblEntry BF16Tbl[] = {
3155 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3156 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3157 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3158 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3159 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3160 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3161 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3162 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3163 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3164 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3165 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3166 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3167 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3168 };
3169
3170 if (ST->hasBF16())
3171 if (const auto *Entry = ConvertCostTableLookup(
3172 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3173 return AdjustCost(Entry->Cost);
3174
3175 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3176 // The cost of unpacking twice is artificially increased for now in order
3177 // to avoid regressions against NEON, which will use tbl instructions directly
3178 // instead of multiple layers of [s|u]unpk[lo|hi].
3179 // We use the unpacks in cases where the destination type is illegal and
3180 // requires splitting of the input, even if the input type itself is legal.
3181 const unsigned int SVE_EXT_COST = 1;
3182 const unsigned int SVE_FCVT_COST = 1;
3183 const unsigned int SVE_UNPACK_ONCE = 4;
3184 const unsigned int SVE_UNPACK_TWICE = 16;
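  // For example, under these symbolic costs a [su]itofp from nxv16i8 to
  // nxv16f16 (one unpack level) is modelled as
  // SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST = 6, while nxv16i8 to nxv16f32 (two
  // unpack levels) is SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST = 20.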
3185
3186 static const TypeConversionCostTblEntry ConversionTbl[] = {
3187 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3188 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3189 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3190 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3191 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3192 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3193 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3194 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3195 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3196 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3197 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3198 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3199 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3200 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3201 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3202 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3203 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3204 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3205 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3206 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3207
3208 // Truncations on nxvmiN
3209 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3210 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3211 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3212 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3213 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3214 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3215 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3216 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3217 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3218 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3219 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3220 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3221 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3222 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3223 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3224 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3225 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3226 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3227 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3228 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3229 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3230 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3231 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3232 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3233 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3234 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3235 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3236 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3237 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3238 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3239 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3240 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3241 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3242
3243 // The number of shll instructions for the extension.
3244 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3245 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3246 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3247 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3248 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3249 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3250 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3251 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3252 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3253 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3254 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3255 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3256 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3257 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3258 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3259 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3260
3261 // FP Ext and trunc
3262 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3263 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3264 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3265 // FP16
3266 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3267 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3268 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3269 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3270 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3271 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3272 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3273 // BF16 (uses shift)
3274 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3275 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3276 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3277 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3278 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3279 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3280 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3281 // FP Ext and trunc
3282 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3283 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3284 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3285 // FP16
3286 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3287 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3288 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3289 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3290 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3291 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3292 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3293 // BF16 (more complex, with +bf16 is handled above)
3294 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3295 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3296 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3297 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3298 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3299 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3300 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3301 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3302
3303 // LowerVectorINT_TO_FP:
3304 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3305 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3306 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3307 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3308 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3309 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3310
3311 // SVE: to nxv2f16
3312 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3313 SVE_EXT_COST + SVE_FCVT_COST},
3314 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3315 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3316 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3317 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3318 SVE_EXT_COST + SVE_FCVT_COST},
3319 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3320 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3321 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3322
3323 // SVE: to nxv4f16
3324 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3325 SVE_EXT_COST + SVE_FCVT_COST},
3326 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3327 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3328 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3329 SVE_EXT_COST + SVE_FCVT_COST},
3330 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3331 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3332
3333 // SVE: to nxv8f16
3334 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3335 SVE_EXT_COST + SVE_FCVT_COST},
3336 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3337 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3338 SVE_EXT_COST + SVE_FCVT_COST},
3339 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3340
3341 // SVE: to nxv16f16
3342 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3343 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3344 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3345 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3346
3347 // Complex: to v2f32
3348 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3349 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3350 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3351 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3352
3353 // SVE: to nxv2f32
3354 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3355 SVE_EXT_COST + SVE_FCVT_COST},
3356 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3357 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3358 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3359 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3360 SVE_EXT_COST + SVE_FCVT_COST},
3361 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3362 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3363 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3364
3365 // Complex: to v4f32
3366 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3367 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3368 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3369 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3370
3371 // SVE: to nxv4f32
3372 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3373 SVE_EXT_COST + SVE_FCVT_COST},
3374 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3375 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3376 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3377 SVE_EXT_COST + SVE_FCVT_COST},
3378 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3379 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3380
3381 // Complex: to v8f32
3382 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3383 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3384 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3385 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3386
3387 // SVE: to nxv8f32
3388 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3389 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3390 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3391 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3392 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3393 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3394 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3395 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3396
3397 // SVE: to nxv16f32
3398 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3399 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3400 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3401 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3402
3403 // Complex: to v16f32
3404 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3405 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3406
3407 // Complex: to v2f64
3408 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3409 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3410 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3411 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3412 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3413 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3414
3415 // SVE: to nxv2f64
3416 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3417 SVE_EXT_COST + SVE_FCVT_COST},
3418 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3419 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3420 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3421 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3422 SVE_EXT_COST + SVE_FCVT_COST},
3423 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3424 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3425 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3426
3427 // Complex: to v4f64
3428 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3429 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3430
3431 // SVE: to nxv4f64
3432 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3433 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3434 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3435 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3436 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3437 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3438 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3439 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3440 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3441 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3442 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3443 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3444
3445 // SVE: to nxv8f64
3446 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3447 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3448 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3449 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3450 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3451 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3452 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3453 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3454
3455 // LowerVectorFP_TO_INT
3456 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3457 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3458 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3459 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3460 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3461 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3462
3463 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3464 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3465 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3466 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3467 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3468 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3469 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3470
3471 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3472 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3473 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3474 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3475 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3476
3477 // Complex, from nxv2f32.
3478 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3479 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3480 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3481 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3482 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3483 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3484 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3485 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3486
3487 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3488 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3489 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3490 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3491 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3492 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3493 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3494
3495 // Complex, from nxv2f64.
3496 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3497 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3498 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3499 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3500 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3501 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3502 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3503 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3504 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3505 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3506
3507 // Complex, from nxv4f32.
3508 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3509 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3510 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3511 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3512 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3513 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3514 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3515 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3516 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3517 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3518
3519 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3520 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3521 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3522 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3523 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3524
3525 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3526 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3527 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3528 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3529 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3530 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3531 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3532
3533 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3534 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3535 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3536 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3537 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3538
3539 // Complex, from nxv8f16.
3540 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3541 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3542 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3543 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3544 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3545 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3546 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3547 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3548 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3549 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3550
3551 // Complex, from nxv4f16.
3552 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3553 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3554 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3555 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3556 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3557 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3558 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3559 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3560
3561 // Complex, from nxv2f16.
3562 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3563 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3564 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3565 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3566 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3567 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3568 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3569 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3570
3571 // Truncate from nxvmf32 to nxvmf16.
3572 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3573 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3574 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3575
3576 // Truncate from nxvmf32 to nxvmbf16.
3577 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3578 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3579 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3580
3581 // Truncate from nxvmf64 to nxvmf16.
3582 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3583 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3584 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3585
3586 // Truncate from nxvmf64 to nxvmbf16.
3587 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3588 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3589 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3590
3591 // Truncate from nxvmf64 to nxvmf32.
3592 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3593 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3594 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3595
3596 // Extend from nxvmf16 to nxvmf32.
3597 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3598 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3599 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3600
3601 // Extend from nxvmbf16 to nxvmf32.
3602 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3603 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3604 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3605
3606 // Extend from nxvmf16 to nxvmf64.
3607 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3608 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3609 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3610
3611 // Extend from nxvmbf16 to nxvmf64.
3612 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3613 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3614 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3615
3616 // Extend from nxvmf32 to nxvmf64.
3617 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3618 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3619 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3620
3621 // Bitcasts from float to integer
3622 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3623 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3624 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3625
3626 // Bitcasts from integer to float
3627 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3628 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3629 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3630
3631 // Add cost for extending to illegal -too wide- scalable vectors.
3632 // zero/sign extend are implemented by multiple unpack operations,
3633 // where each operation has a cost of 1.
3634 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3635 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3636 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3637 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3638 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3639 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3640
3641 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3642 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3643 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3644 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3645 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3646 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3647 };
3648
3649  // We have to estimate the cost of a fixed-length operation on SVE
3650  // registers using the number of SVE registers required to represent the
3651  // fixed-length vector type.
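  // For example, with SVE used for fixed-length vectors, a trunc from v16i64
  // to v16i32 legalizes the wider v16i64 type and is then costed as LT.first
  // copies of an equivalent nxv2i64 -> nxv2i32 trunc (128 / 64 == 2 elements
  // per SVE block).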
3652 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3653 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3654 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3655 ST->useSVEForFixedLengthVectors(WiderTy)) {
3656 std::pair<InstructionCost, MVT> LT =
3657 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3658 unsigned NumElements =
3659 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3660 return AdjustCost(
3661        LT.first *
3662        getCastInstrCost(
3663 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3664 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3665 CostKind, I));
3666 }
3667
3668 if (const auto *Entry = ConvertCostTableLookup(
3669 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3670 return AdjustCost(Entry->Cost);
3671
3672 static const TypeConversionCostTblEntry FP16Tbl[] = {
3673 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3674 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3675 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3676 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3677 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3678 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3679 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3680 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3681 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3682 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3683 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3684 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3685 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3686 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3687 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3688 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3689 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3690 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3691 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3692 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3693 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3694 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3695 };
3696
3697 if (ST->hasFullFP16())
3698 if (const auto *Entry = ConvertCostTableLookup(
3699 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3700 return AdjustCost(Entry->Cost);
3701
3702 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3703 // double-rounding issues.
3704 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3705 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3707 return AdjustCost(
3709 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3710 CCH, CostKind) +
3712 CostKind) +
3714 CostKind));
3715
3716 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3717      CCH == TTI::CastContextHint::Masked &&
3718      ST->isSVEorStreamingSVEAvailable() &&
3719      TLI->getTypeAction(Src->getContext(), SrcTy) ==
3720          TargetLowering::TypePromoteInteger &&
3721      TLI->getTypeAction(Dst->getContext(), DstTy) ==
3722          TargetLowering::TypeSplitVector) {
3723 // The standard behaviour in the backend for these cases is to split the
3724 // extend up into two parts:
3725 // 1. Perform an extending load or masked load up to the legal type.
3726 // 2. Extend the loaded data to the final type.
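    // For example, a masked zero-extend from <vscale x 8 x i8> (promoted) to
    // <vscale x 8 x i64> (split) is costed as Part1, the extend up to the
    // legal nxv8i16 (which can typically fold into an extending masked load),
    // plus Part2, the remaining nxv8i16 -> nxv8i64 extend.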
3727 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3728 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3729    InstructionCost Part1 = getCastInstrCost(
3730        Opcode, LegalTy, Src, CCH, CostKind, I);
3731    InstructionCost Part2 = getCastInstrCost(
3732        Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3733 return Part1 + Part2;
3734 }
3735
3736 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3737 // but we also want to include the TTI::CastContextHint::Masked case too.
3738 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3739      CCH == TTI::CastContextHint::Masked &&
3740      ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3741    CCH = TTI::CastContextHint::Normal;
3742
3743 return AdjustCost(
3744 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3745}
3746
3747InstructionCost
3748AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3749                                         VectorType *VecTy, unsigned Index,
3750                                         TTI::TargetCostKind CostKind) const {
3751
3752 // Make sure we were given a valid extend opcode.
3753 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3754 "Invalid opcode");
3755
3756 // We are extending an element we extract from a vector, so the source type
3757 // of the extend is the element type of the vector.
3758 auto *Src = VecTy->getElementType();
3759
3760 // Sign- and zero-extends are for integer types only.
3761 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3762
3763 // Get the cost for the extract. We compute the cost (if any) for the extend
3764 // below.
3765 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3766 CostKind, Index, nullptr, nullptr);
3767
3768 // Legalize the types.
3769 auto VecLT = getTypeLegalizationCost(VecTy);
3770 auto DstVT = TLI->getValueType(DL, Dst);
3771 auto SrcVT = TLI->getValueType(DL, Src);
3772
3773 // If the resulting type is still a vector and the destination type is legal,
3774 // we may get the extension for free. If not, get the default cost for the
3775 // extend.
3776 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3777 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3778 CostKind);
3779
3780 // The destination type should be larger than the element type. If not, get
3781 // the default cost for the extend.
3782 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3783 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3784 CostKind);
3785
3786 switch (Opcode) {
3787 default:
3788 llvm_unreachable("Opcode should be either SExt or ZExt");
3789
3790 // For sign-extends, we only need a smov, which performs the extension
3791 // automatically.
3792 case Instruction::SExt:
3793 return Cost;
3794
3795 // For zero-extends, the extend is performed automatically by a umov unless
3796 // the destination type is i64 and the element type is i8 or i16.
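  // For example, (zext (extractelement <8 x i16> %v, i32 1)) to i32 is covered
  // by the umov alone, whereas zero-extending an i8 or i16 element to i64
  // needs the extra extend costed below.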
3797 case Instruction::ZExt:
3798 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3799 return Cost;
3800 }
3801
3802 // If we are unable to perform the extend for free, get the default cost.
3803 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3804 CostKind);
3805}
3806
3807InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3808                                               TTI::TargetCostKind CostKind,
3809                                               const Instruction *I) const {
3810  if (CostKind != TTI::TCK_RecipThroughput)
3811    return Opcode == Instruction::PHI ? 0 : 1;
3812 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3813 // Branches are assumed to be predicted.
3814 return 0;
3815}
3816
3817InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3818 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3819 const Instruction *I, Value *Scalar,
3820 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3821 assert(Val->isVectorTy() && "This must be a vector type");
3822
3823 if (Index != -1U) {
3824 // Legalize the type.
3825 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3826
3827 // This type is legalized to a scalar type.
3828 if (!LT.second.isVector())
3829 return 0;
3830
3831 // The type may be split. For fixed-width vectors we can normalize the
3832 // index to the new type.
3833 if (LT.second.isFixedLengthVector()) {
3834 unsigned Width = LT.second.getVectorNumElements();
3835 Index = Index % Width;
3836 }
3837
3838 // The element at index zero is already inside the vector.
3839    // - For an insert-element or extract-element
3840    // instruction that extracts integers, an explicit FPR -> GPR move is
3841    // needed. So it has non-zero cost.
3842 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
3843 return 0;
3844
3845    // This recognises an LD1 (single-element structure to one lane of one
3846    // register) instruction. I.e., if this is an `insertelement` instruction,
3847    // and its second operand is a load, then we will generate an LD1, which
3848    // is an expensive instruction.
3849 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3850 return CostKind == TTI::TCK_CodeSize
3851                 ? 0
3852                 : ST->getVectorInsertExtractBaseCost();
3853
3854    // i1 inserts and extracts will include an extra cset or cmp of the vector
3855    // value. Increase the cost by 1 to account for this.
3856 if (Val->getScalarSizeInBits() == 1)
3857 return CostKind == TTI::TCK_CodeSize
3858                 ? 2
3859                 : ST->getVectorInsertExtractBaseCost() + 1;
3860
3861 // FIXME:
3862 // If the extract-element and insert-element instructions could be
3863 // simplified away (e.g., could be combined into users by looking at use-def
3864 // context), they have no cost. This is not done in the first place for
3865 // compile-time considerations.
3866 }
3867
3868  // For Neon, if there exists an extractelement from lane != 0 such that
3869  // 1. the extractelement does not necessitate a move from vector_reg -> GPR,
3870  // 2. the extractelement result feeds into fmul, and
3871  // 3. the other operand of the fmul is an extractelement from lane 0 or a
3872  //    lane equivalent to 0,
3873  // then the extractelement can be merged with the fmul in the backend and it
3874  // incurs no cost.
3875 // e.g.
3876 // define double @foo(<2 x double> %a) {
3877 // %1 = extractelement <2 x double> %a, i32 0
3878 // %2 = extractelement <2 x double> %a, i32 1
3879 // %res = fmul double %1, %2
3880 // ret double %res
3881 // }
3882 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3883 auto ExtractCanFuseWithFmul = [&]() {
3884 // We bail out if the extract is from lane 0.
3885 if (Index == 0)
3886 return false;
3887
3888 // Check if the scalar element type of the vector operand of ExtractElement
3889 // instruction is one of the allowed types.
3890 auto IsAllowedScalarTy = [&](const Type *T) {
3891 return T->isFloatTy() || T->isDoubleTy() ||
3892 (T->isHalfTy() && ST->hasFullFP16());
3893 };
3894
3895 // Check if the extractelement user is scalar fmul.
3896 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3897 // Check if the user is scalar fmul.
3898 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
3899 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3900 !BO->getType()->isVectorTy();
3901 };
3902
3903 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3904 // certain scalar type and a certain vector register width.
3905 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3906 auto RegWidth =
3907          getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
3908              .getFixedValue();
3909 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3910 };
3911
3912 // Check if the type constraints on input vector type and result scalar type
3913 // of extractelement instruction are satisfied.
3914 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3915 return false;
3916
3917 if (Scalar) {
3918 DenseMap<User *, unsigned> UserToExtractIdx;
3919 for (auto *U : Scalar->users()) {
3920 if (!IsUserFMulScalarTy(U))
3921 return false;
3922 // Recording entry for the user is important. Index value is not
3923 // important.
3924 UserToExtractIdx[U];
3925 }
3926 if (UserToExtractIdx.empty())
3927 return false;
3928 for (auto &[S, U, L] : ScalarUserAndIdx) {
3929 for (auto *U : S->users()) {
3930 if (UserToExtractIdx.contains(U)) {
3931 auto *FMul = cast<BinaryOperator>(U);
3932 auto *Op0 = FMul->getOperand(0);
3933 auto *Op1 = FMul->getOperand(1);
3934 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3935 UserToExtractIdx[U] = L;
3936 break;
3937 }
3938 }
3939 }
3940 }
3941 for (auto &[U, L] : UserToExtractIdx) {
3942 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3943 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3944 return false;
3945 }
3946 } else {
3947 const auto *EE = cast<ExtractElementInst>(I);
3948
3949 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3950 if (!IdxOp)
3951 return false;
3952
3953 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3954 if (!IsUserFMulScalarTy(U))
3955 return false;
3956
3957 // Check if the other operand of extractelement is also extractelement
3958 // from lane equivalent to 0.
3959 const auto *BO = cast<BinaryOperator>(U);
3960 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3961 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3962 if (OtherEE) {
3963 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3964 if (!IdxOp)
3965 return false;
3966 return IsExtractLaneEquivalentToZero(
3967 cast<ConstantInt>(OtherEE->getIndexOperand())
3968 ->getValue()
3969 .getZExtValue(),
3970 OtherEE->getType()->getScalarSizeInBits());
3971 }
3972 return true;
3973 });
3974 }
3975 return true;
3976 };
3977
3978 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3979 ExtractCanFuseWithFmul())
3980 return 0;
3981
3982 // All other insert/extracts cost this much.
3983 return CostKind == TTI::TCK_CodeSize ? 1
3984 : ST->getVectorInsertExtractBaseCost();
3985}
3986
3987InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3988                                                   TTI::TargetCostKind CostKind,
3989                                                   unsigned Index,
3990 const Value *Op0,
3991 const Value *Op1) const {
3992  // Treat an insert at lane 0 into a poison vector as having zero cost. This
3993  // ensures vector broadcasts via an insert + shuffle (which will be lowered
3994  // to a single dup) are treated as cheap.
3995 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
3996 isa<PoisonValue>(Op0))
3997 return 0;
3998 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
3999}
4000
4001InstructionCost AArch64TTIImpl::getVectorInstrCost(
4002    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4003 Value *Scalar,
4004 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4005 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4006 ScalarUserAndIdx);
4007}
4008
4009InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
4010                                                   Type *Val,
4011                                                   TTI::TargetCostKind CostKind,
4012                                                   unsigned Index) const {
4013 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
4014}
4015
4019 unsigned Index) const {
4020 if (isa<FixedVectorType>(Val))
4022 Index);
4023
4024 // This typically requires both while and lastb instructions in order
4025 // to extract the last element. If this is in a loop the while
4026 // instruction can at least be hoisted out, although it will consume a
4027  // predicate register. The cost should therefore be higher than the base
4028  // extract cost, which is 2 for most CPUs.
4029 return CostKind == TTI::TCK_CodeSize
4030 ? 2
4031 : ST->getVectorInsertExtractBaseCost() + 1;
4032}
4033
4034InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4035    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4036 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4037 ArrayRef<Value *> VL) const {
4038  if (isa<ScalableVectorType>(Ty))
4039    return InstructionCost::getInvalid();
4040  if (Ty->getElementType()->isFloatingPointTy())
4041 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4042 CostKind);
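  // For example, with 4 demanded integer lanes, Insert and Extract both set
  // and a base insert/extract cost of 2, the overhead below is
  // 4 * (1 + 1) * 2 = 16.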
4043 unsigned VecInstCost =
4044 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4045 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4046}
4047
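// Helper for costing fp16/bf16 operations that must be promoted to f32. For
// example, an fadd on <4 x half> without +fullfp16 is costed as the fpext(s)
// to <4 x float> (doubled when neither operand is a constant), the fadd at
// <4 x float> and, when IncludeTrunc is set, the fptrunc back to <4 x half>.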
4048std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4049    Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4050    TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
4051 std::function<InstructionCost(Type *)> InstCost) const {
4052 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4053 return std::nullopt;
4054 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4055 return std::nullopt;
4056
4057 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4058  InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4059                                          TTI::CastContextHint::None, CostKind);
4060 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4061 Cost *= 2;
4062 Cost += InstCost(PromotedTy);
4063 if (IncludeTrunc)
4064 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4065                             TTI::CastContextHint::None, CostKind);
4066  return Cost;
4067}
4068
4069InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4070    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4071    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4072    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4073
4074 // The code-generator is currently not able to handle scalable vectors
4075 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4076 // it. This change will be removed when code-generation for these types is
4077 // sufficiently reliable.
4078 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4079 if (VTy->getElementCount() == ElementCount::getScalable(1))
4080      return InstructionCost::getInvalid();
4081
4082 // TODO: Handle more cost kinds.
4083  if (CostKind != TTI::TCK_RecipThroughput)
4084    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4085 Op2Info, Args, CxtI);
4086
4087 // Legalize the type.
4088 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4089 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4090
4091 // Increase the cost for half and bfloat types if not architecturally
4092 // supported.
4093 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4094 ISD == ISD::FDIV || ISD == ISD::FREM)
4095 if (auto PromotedCost = getFP16BF16PromoteCost(
4096 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4097 [&](Type *PromotedTy) {
4098 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4099 Op1Info, Op2Info);
4100 }))
4101 return *PromotedCost;
4102
4103 switch (ISD) {
4104 default:
4105 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4106 Op2Info);
4107 case ISD::SREM:
4108 case ISD::SDIV:
4109 /*
4110 Notes for sdiv/srem specific costs:
4111 1. This only considers the cases where the divisor is constant, uniform and
4112 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4113 result in some form of (ldr + adrp), corresponding to constant vectors, or
4114 scalarization of the division operation.
4115    2. Constant divisors, whether wholly or partially negative, don't result in
4116    significantly different codegen compared to positive constant divisors.
4117    So, we don't consider negative divisors separately.
4118 3. If the codegen is significantly different with SVE, it has been indicated
4119 using comments at appropriate places.
4120
4121 sdiv specific cases:
4122 -----------------------------------------------------------------------
4123 codegen | pow-of-2 | Type
4124 -----------------------------------------------------------------------
4125 add + cmp + csel + asr | Y | i64
4126 add + cmp + csel + asr | Y | i32
4127 -----------------------------------------------------------------------
4128
4129 srem specific cases:
4130 -----------------------------------------------------------------------
4131 codegen | pow-of-2 | Type
4132 -----------------------------------------------------------------------
4133 negs + and + and + csneg | Y | i64
4134 negs + and + and + csneg | Y | i32
4135 -----------------------------------------------------------------------
4136
4137 other sdiv/srem cases:
4138 -------------------------------------------------------------------------
4139 common codegen | + srem | + sdiv | pow-of-2 | Type
4140 -------------------------------------------------------------------------
4141 smulh + asr + add + add | - | - | N | i64
4142 smull + lsr + add + add | - | - | N | i32
4143 usra | and + sub | sshr | Y | <2 x i64>
4144 2 * (scalar code) | - | - | N | <2 x i64>
4145 usra | bic + sub | sshr + neg | Y | <4 x i32>
4146 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4147 + sshr + usra | | | |
4148 -------------------------------------------------------------------------
4149 */
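    // For example, a uniform power-of-2 scalar divisor such as
    //   %r = sdiv i32 %x, 8
    // follows the add + cmp + csel + asr row above and is costed below as
    // 3 * AddCost + AsrCost.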
4150 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4151 InstructionCost AddCost =
4152 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4153 Op1Info.getNoProps(), Op2Info.getNoProps());
4154 InstructionCost AsrCost =
4155 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4156 Op1Info.getNoProps(), Op2Info.getNoProps());
4157 InstructionCost MulCost =
4158 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4159 Op1Info.getNoProps(), Op2Info.getNoProps());
4160 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4161 // have similar cost.
4162 auto VT = TLI->getValueType(DL, Ty);
4163 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4164 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4165 // Neg can be folded into the asr instruction.
4166 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4167 : (3 * AsrCost + AddCost);
4168 } else {
4169 return MulCost + AsrCost + 2 * AddCost;
4170 }
4171 } else if (VT.isVector()) {
4172 InstructionCost UsraCost = 2 * AsrCost;
4173 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4174 // Division with scalable types corresponds to native 'asrd'
4175 // instruction when SVE is available.
4176 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4177
4178 // One more for the negation in SDIV
4179        InstructionCost Cost =
4180            (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4181 if (Ty->isScalableTy() && ST->hasSVE())
4182 Cost += 2 * AsrCost;
4183 else {
4184 Cost +=
4185 UsraCost +
4186 (ISD == ISD::SDIV
4187 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4188 : 2 * AddCost);
4189 }
4190 return Cost;
4191 } else if (LT.second == MVT::v2i64) {
4192 return VT.getVectorNumElements() *
4193 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4194 Op1Info.getNoProps(),
4195 Op2Info.getNoProps());
4196 } else {
4197 // When SVE is available, we get:
4198 // smulh + lsr + add/sub + asr + add/sub.
4199 if (Ty->isScalableTy() && ST->hasSVE())
4200 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4201 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4202 }
4203 }
4204 }
4205 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4206 LT.second.isFixedLengthVector()) {
4207 // FIXME: When the constant vector is non-uniform, this may result in
4208 // loading the vector from constant pool or in some cases, may also result
4209 // in scalarization. For now, we are approximating this with the
4210 // scalarization cost.
4211 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4212 CostKind, -1, nullptr, nullptr);
4213 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4214 CostKind, -1, nullptr, nullptr);
4215 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4216 return ExtractCost + InsertCost +
4217 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4218 CostKind, Op1Info.getNoProps(),
4219 Op2Info.getNoProps());
4220 }
4221 [[fallthrough]];
4222 case ISD::UDIV:
4223 case ISD::UREM: {
4224 auto VT = TLI->getValueType(DL, Ty);
4225 if (Op2Info.isConstant()) {
4226 // If the operand is a power of 2 we can use the shift or and cost.
4227 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4228 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4229 Op1Info.getNoProps(),
4230 Op2Info.getNoProps());
4231 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4232 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4233 Op1Info.getNoProps(),
4234 Op2Info.getNoProps());
4235
4236 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4237 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4238 // The MULHU will be expanded to UMULL for the types not listed below,
4239 // and will become a pair of UMULL+MULL2 for 128bit vectors.
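        // For example, a scalar i64 udiv by a non-power-of-2 constant has
        // HasMULH true and Is128bit false, so the cost below works out to
        // MulCost + 2 * AddCost + ShrCost.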
4240 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4241 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4242 LT.second == MVT::nxv16i8;
4243 bool Is128bit = LT.second.is128BitVector();
4244
4245 InstructionCost MulCost =
4246 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4247 Op1Info.getNoProps(), Op2Info.getNoProps());
4248 InstructionCost AddCost =
4249 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4250 Op1Info.getNoProps(), Op2Info.getNoProps());
4251 InstructionCost ShrCost =
4252 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4253 Op1Info.getNoProps(), Op2Info.getNoProps());
4254 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4255 (HasMULH ? 0 : ShrCost) + // UMULL shift
4256 AddCost * 2 + ShrCost;
4257 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4258 }
4259 }
4260
4261 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4262 // emitted by the backend even when those functions are not declared in the
4263 // module.
4264 if (!VT.isVector() && VT.getSizeInBits() > 64)
4265 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4266
4267    InstructionCost Cost = BaseT::getArithmeticInstrCost(
4268        Opcode, Ty, CostKind, Op1Info, Op2Info);
4269 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4270 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4271 // SDIV/UDIV operations are lowered using SVE, then we can have less
4272 // costs.
4273 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4274 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4275 static const CostTblEntry DivTbl[]{
4276 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4277 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4278 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4279 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4280 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4281 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4282
4283 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4284 if (nullptr != Entry)
4285 return Entry->Cost;
4286 }
4287 // For 8/16-bit elements, the cost is higher because the type
4288 // requires promotion and possibly splitting:
4289 if (LT.second.getScalarType() == MVT::i8)
4290 Cost *= 8;
4291 else if (LT.second.getScalarType() == MVT::i16)
4292 Cost *= 4;
4293 return Cost;
4294 } else {
4295 // If one of the operands is a uniform constant then the cost for each
4296 // element is Cost for insertion, extraction and division.
4297 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4298 // operation with scalar type
4299 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4300 (Op2Info.isConstant() && Op2Info.isUniform())) {
4301 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4302          InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4303              Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4304 return (4 + DivCost) * VTy->getNumElements();
4305 }
4306 }
4307 // On AArch64, without SVE, vector divisions are expanded
4308 // into scalar divisions of each pair of elements.
4309 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4310 -1, nullptr, nullptr);
4311 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4312 nullptr, nullptr);
4313 }
4314
4315 // TODO: if one of the arguments is scalar, then it's not necessary to
4316 // double the cost of handling the vector elements.
4317 Cost += Cost;
4318 }
4319 return Cost;
4320 }
4321 case ISD::MUL:
4322 // When SVE is available, then we can lower the v2i64 operation using
4323 // the SVE mul instruction, which has a lower cost.
4324 if (LT.second == MVT::v2i64 && ST->hasSVE())
4325 return LT.first;
4326
4327 // When SVE is not available, there is no MUL.2d instruction,
4328 // which means mul <2 x i64> is expensive as elements are extracted
4329 // from the vectors and the muls scalarized.
4330 // As getScalarizationOverhead is a bit too pessimistic, we
4331 // estimate the cost for a i64 vector directly here, which is:
4332 // - four 2-cost i64 extracts,
4333 // - two 2-cost i64 inserts, and
4334 // - two 1-cost muls.
4335    // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4336    // LT.first = 2 the cost is 28. If both operands are extensions it will not
4337    // need to scalarize, so the cost can be cheaper (smull or umull).
4339 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
4340 return LT.first;
4341 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4342 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4343 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4344 nullptr, nullptr) *
4345 2 +
4346 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4347 nullptr, nullptr));
4348 case ISD::ADD:
4349 case ISD::XOR:
4350 case ISD::OR:
4351 case ISD::AND:
4352 case ISD::SRL:
4353 case ISD::SRA:
4354 case ISD::SHL:
4355 // These nodes are marked as 'custom' for combining purposes only.
4356 // We know that they are legal. See LowerAdd in ISelLowering.
4357 return LT.first;
4358
4359 case ISD::FNEG:
4360 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4361 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4362 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4363 CxtI &&
4364 ((CxtI->hasOneUse() &&
4365 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4366 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4367 return 0;
4368 [[fallthrough]];
4369 case ISD::FADD:
4370 case ISD::FSUB:
4371 if (!Ty->getScalarType()->isFP128Ty())
4372 return LT.first;
4373 [[fallthrough]];
4374 case ISD::FMUL:
4375 case ISD::FDIV:
4376 // These nodes are marked as 'custom' just to lower them to SVE.
4377 // We know said lowering will incur no additional cost.
4378 if (!Ty->getScalarType()->isFP128Ty())
4379 return 2 * LT.first;
4380
4381 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4382 Op2Info);
4383 case ISD::FREM:
4384 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4385 // those functions are not declared in the module.
4386 if (!Ty->isVectorTy())
4387 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4388 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4389 Op2Info);
4390 }
4391}
4392
4393InstructionCost
4394AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4395                                          const SCEV *Ptr,
4396                                          TTI::TargetCostKind CostKind) const {
4397  // Address computations in vectorized code with non-consecutive addresses will
4398 // likely result in more instructions compared to scalar code where the
4399 // computation can more often be merged into the index mode. The resulting
4400 // extra micro-ops can significantly decrease throughput.
4401 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4402 int MaxMergeDistance = 64;
4403
4404 if (PtrTy->isVectorTy() && SE &&
4405 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4406 return NumVectorInstToHideOverhead;
4407
4408 // In many cases the address computation is not merged into the instruction
4409 // addressing mode.
4410 return 1;
4411}
4412
4413/// Check whether Opcode1 has less throughput according to the scheduling
4414/// model than Opcode2.
4415bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4416    unsigned Opcode1, unsigned Opcode2) const {
4417 const MCSchedModel &Sched = ST->getSchedModel();
4418 const TargetInstrInfo *TII = ST->getInstrInfo();
4419 if (!Sched.hasInstrSchedModel())
4420 return false;
4421
4422 const MCSchedClassDesc *SCD1 =
4423 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4424 const MCSchedClassDesc *SCD2 =
4425 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4426 // We cannot handle variant scheduling classes without an MI. If we need to
4427 // support them for any of the instructions we query the information of we
4428 // might need to add a way to resolve them without a MI or not use the
4429 // scheduling info.
4430 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4431 "Cannot handle variant scheduling classes without an MI");
4432 if (!SCD1->isValid() || !SCD2->isValid())
4433 return false;
4434
4435 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4436         MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4437}
4438
4439InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4440    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4441    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4442    TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4443 // We don't lower some vector selects well that are wider than the register
4444 // width. TODO: Improve this with different cost kinds.
4445 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4446 // We would need this many instructions to hide the scalarization happening.
4447 const int AmortizationCost = 20;
4448
4449 // If VecPred is not set, check if we can get a predicate from the context
4450 // instruction, if its type matches the requested ValTy.
4451 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4452 CmpPredicate CurrentPred;
4453 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4454 m_Value())))
4455 VecPred = CurrentPred;
4456 }
4457 // Check if we have a compare/select chain that can be lowered using
4458 // a (F)CMxx & BFI pair.
4459 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4460 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4461 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4462 VecPred == CmpInst::FCMP_UNE) {
4463 static const auto ValidMinMaxTys = {
4464 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4465 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4466 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4467
4468 auto LT = getTypeLegalizationCost(ValTy);
4469 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4470 (ST->hasFullFP16() &&
4471 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
4472 return LT.first;
4473 }
4474
4475 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4476 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4477 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4478 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4479 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4480 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4481 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4482 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4483 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4484 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4485 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4486 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4487
4488 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4489 EVT SelValTy = TLI->getValueType(DL, ValTy);
4490 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4491 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4492 SelCondTy.getSimpleVT(),
4493 SelValTy.getSimpleVT()))
4494 return Entry->Cost;
4495 }
4496 }
4497
4498 if (Opcode == Instruction::FCmp) {
4499 if (auto PromotedCost = getFP16BF16PromoteCost(
4500 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4501 [&](Type *PromotedTy) {
4502                  InstructionCost Cost =
4503                      getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4504                                         CostKind, Op1Info, Op2Info);
4505                  if (isa<VectorType>(PromotedTy))
4506                    Cost += getCastInstrCost(
4507                        Instruction::Trunc,
4508                        VectorType::getInteger(cast<VectorType>(ValTy)),
4509                        VectorType::getInteger(cast<VectorType>(PromotedTy)),
4510                        TTI::CastContextHint::None, CostKind);
4511                  return Cost;
4512 }))
4513 return *PromotedCost;
4514
4515 auto LT = getTypeLegalizationCost(ValTy);
4516 // Model unknown fp compares as a libcall.
4517 if (LT.second.getScalarType() != MVT::f64 &&
4518 LT.second.getScalarType() != MVT::f32 &&
4519 LT.second.getScalarType() != MVT::f16)
4520 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4521 {ValTy, ValTy}, CostKind);
4522
4523 // Some comparison operators require expanding to multiple compares + or.
4524 unsigned Factor = 1;
4525 if (!CondTy->isVectorTy() &&
4526 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4527 Factor = 2; // fcmp with 2 selects
4528 else if (isa<FixedVectorType>(ValTy) &&
4529 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4530 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4531 Factor = 3; // fcmxx+fcmyy+or
4532 else if (isa<ScalableVectorType>(ValTy) &&
4533 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4534 Factor = 3; // fcmxx+fcmyy+or
4535
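    // For example, an FCMP_ONE on <4 x float> expands to two compares plus an
    // ORR, so Factor == 3 and, for a throughput cost kind with LT.first == 1,
    // the cost returned below is 3.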
4536 if (isa<ScalableVectorType>(ValTy) &&
4538 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4539 AArch64::FCMEQv4f32))
4540 Factor *= 2;
4541
4542 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4543 }
4544
4545 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4546 // icmp(and, 0) as free, as we can make use of ands, but only if the
4547 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4548 // providing it will not cause performance regressions.
4549 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4550 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4551 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4552 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4553 if (match(I->getOperand(1), m_Zero()))
4554 return 0;
4555
4556 // x >= 1 / x < 1 -> x > 0 / x <= 0
4557 if (match(I->getOperand(1), m_One()) &&
4558 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4559 return 0;
4560
4561 // x <= -1 / x > -1 -> x > 0 / x <= 0
4562 if (match(I->getOperand(1), m_AllOnes()) &&
4563 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4564 return 0;
4565 }
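  // For example, with a legal i32 type, 'icmp eq (and x, y), 0' folds into a
  // single ANDS, and 'icmp slt (and x, y), 1' or 'icmp sgt (and x, y), -1' are
  // canonicalized to compares against zero, so all of these are treated as
  // free above.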
4566
4567 // The base case handles scalable vectors fine for now, since it treats the
4568 // cost as 1 * legalization cost.
4569 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4570 Op1Info, Op2Info, I);
4571}
4572
4573TTI::MemCmpExpansionOptions
4574AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4575  TTI::MemCmpExpansionOptions Options;
4576  if (ST->requiresStrictAlign()) {
4577 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4578 // a bunch of instructions when strict align is enabled.
4579 return Options;
4580 }
4581 Options.AllowOverlappingLoads = true;
4582 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4583 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4584  // TODO: Though vector loads usually perform well on AArch64, on some targets
4585 // they may wake up the FP unit, which raises the power consumption. Perhaps
4586 // they could be used with no holds barred (-O3).
4587 Options.LoadSizes = {8, 4, 2, 1};
4588 Options.AllowedTailExpansions = {3, 5, 6};
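  // For example, AllowOverlappingLoads lets a 7-byte compare be expanded as
  // two overlapping 4-byte loads, and AllowedTailExpansions keeps odd tails of
  // 3, 5 or 6 bytes expandable (e.g. a 2-byte plus a 1-byte load combined)
  // rather than falling back to the libcall.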
4589 return Options;
4590}
4591
4592bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4593  return ST->hasSVE();
4594}
4595
4596InstructionCost
4597AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
4598                                      Align Alignment, unsigned AddressSpace,
4599                                      TTI::TargetCostKind CostKind) const {
4600  if (useNeonVector(Src))
4601 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4602 CostKind);
4603 auto LT = getTypeLegalizationCost(Src);
4604  if (!LT.first.isValid())
4605    return InstructionCost::getInvalid();
4606
4607 // Return an invalid cost for element types that we are unable to lower.
4608 auto *VT = cast<VectorType>(Src);
4609  if (VT->getElementType()->isIntegerTy(1))
4610    return InstructionCost::getInvalid();
4611
4612 // The code-generator is currently not able to handle scalable vectors
4613 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4614 // it. This change will be removed when code-generation for these types is
4615 // sufficiently reliable.
4616  if (VT->getElementCount() == ElementCount::getScalable(1))
4617    return InstructionCost::getInvalid();
4618
4619 return LT.first;
4620}
4621
4622// This function returns gather/scatter overhead either from
4623// user-provided value or specialized values per-target from \p ST.
4624static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4625 const AArch64Subtarget *ST) {
4626 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4627 "Should be called on only load or stores.");
4628 switch (Opcode) {
4629 case Instruction::Load:
4630 if (SVEGatherOverhead.getNumOccurrences() > 0)
4631 return SVEGatherOverhead;
4632 return ST->getGatherOverhead();
4633 break;
4634 case Instruction::Store:
4635 if (SVEScatterOverhead.getNumOccurrences() > 0)
4636 return SVEScatterOverhead;
4637 return ST->getScatterOverhead();
4638 break;
4639 default:
4640 llvm_unreachable("Shouldn't have reached here");
4641 }
4642}
4643
4644InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
4645    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
4646 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
4647 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4648 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
4649 Alignment, CostKind, I);
4650 auto *VT = cast<VectorType>(DataTy);
4651 auto LT = getTypeLegalizationCost(DataTy);
4652  if (!LT.first.isValid())
4653    return InstructionCost::getInvalid();
4654
4655 // Return an invalid cost for element types that we are unable to lower.
4656 if (!LT.second.isVector() ||
4657 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4658      VT->getElementType()->isIntegerTy(1))
4659    return InstructionCost::getInvalid();
4660
4661 // The code-generator is currently not able to handle scalable vectors
4662 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4663 // it. This change will be removed when code-generation for these types is
4664 // sufficiently reliable.
4665  if (VT->getElementCount() == ElementCount::getScalable(1))
4666    return InstructionCost::getInvalid();
4667
4668 ElementCount LegalVF = LT.second.getVectorElementCount();
4669 InstructionCost MemOpCost =
4670 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4671 {TTI::OK_AnyValue, TTI::OP_None}, I);
4672 // Add on an overhead cost for using gathers/scatters.
4673 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
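  // The resulting cost is LT.first * (scalar memory-op cost * overhead) *
  // max lane count, i.e. each lane of the gather/scatter is modelled as an
  // overhead-weighted scalar access.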
4674 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4675}
4676
4678 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4679}
4680
4681InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
4682                                                Align Alignment,
4683 unsigned AddressSpace,
4685 TTI::OperandValueInfo OpInfo,
4686 const Instruction *I) const {
4687 EVT VT = TLI->getValueType(DL, Ty, true);
4688 // Type legalization can't handle structs
4689 if (VT == MVT::Other)
4690 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4691 CostKind);
4692
4693 auto LT = getTypeLegalizationCost(Ty);
4694  if (!LT.first.isValid())
4695    return InstructionCost::getInvalid();
4696
4697 // The code-generator is currently not able to handle scalable vectors
4698 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4699 // it. This change will be removed when code-generation for these types is
4700 // sufficiently reliable.
4701 // We also only support full register predicate loads and stores.
4702 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4703 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4704 (VTy->getElementType()->isIntegerTy(1) &&
4705         (VTy->getElementType()->isIntegerTy(1) &&
4706          !VTy->getElementCount().isKnownMultipleOf(
4707              ElementCount::getScalable(16))))
4708      return InstructionCost::getInvalid();
4709 // TODO: consider latency as well for TCK_SizeAndLatency.
4710  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
4711    return LT.first;
4712
4713  if (CostKind != TTI::TCK_RecipThroughput)
4714    return 1;
4715
4716 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4717 LT.second.is128BitVector() && Alignment < Align(16)) {
4718 // Unaligned stores are extremely inefficient. We don't split all
4719    // unaligned 128-bit stores because of the negative impact that has been
4720    // shown in practice on inlined block copy code.
4721 // We make such stores expensive so that we will only vectorize if there
4722 // are 6 other instructions getting vectorized.
4723 const int AmortizationCost = 6;
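    // For example, a misaligned 128-bit store that legalizes in one step
    // (LT.first == 1) is costed at 1 * 2 * 6 = 12.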
4724
4725 return LT.first * 2 * AmortizationCost;
4726 }
4727
4728 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4729 if (Ty->isPtrOrPtrVectorTy())
4730 return LT.first;
4731
4732 if (useNeonVector(Ty)) {
4733 // Check truncating stores and extending loads.
4734 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4735      // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4736 if (VT == MVT::v4i8)
4737 return 2;
4738 // Otherwise we need to scalarize.
4739 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4740 }
4741 EVT EltVT = VT.getVectorElementType();
4742 unsigned EltSize = EltVT.getScalarSizeInBits();
4743 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4744 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4745 return LT.first;
4746 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4747 // widening to v4i8, which produces suboptimal results.
4748 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4749 return LT.first;
4750
4751 // Check non-power-of-2 loads/stores for legal vector element types with
4752 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4753 // operations on smaller power-of-2 ops, including ld1/st1.
4754 LLVMContext &C = Ty->getContext();
4755    InstructionCost Cost = 0;
4756    SmallVector<EVT> TypeWorklist;
4757 TypeWorklist.push_back(VT);
4758 while (!TypeWorklist.empty()) {
4759 EVT CurrVT = TypeWorklist.pop_back_val();
4760 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4761 if (isPowerOf2_32(CurrNumElements)) {
4762 Cost += 1;
4763 continue;
4764 }
4765
4766 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4767 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4768 TypeWorklist.push_back(
4769 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4770 }
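    // For example, a v7i16 access splits into v4i16 + v3i16, and v3i16 splits
    // again into v2i16 + v1i16, so Cost == 3 (one operation each for v4i16,
    // v2i16 and v1i16).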
4771 return Cost;
4772 }
4773
4774 return LT.first;
4775}
4776
4777InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
4778    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4779 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4780 bool UseMaskForCond, bool UseMaskForGaps) const {
4781 assert(Factor >= 2 && "Invalid interleave factor");
4782 auto *VecVTy = cast<VectorType>(VecTy);
4783
4784  if (VecTy->isScalableTy() && !ST->hasSVE())
4785    return InstructionCost::getInvalid();
4786
4787 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4788 // only have lowering for power-of-2 factors.
4789 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
4790 // InterleavedAccessPass for ld3/st3
4791  if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
4792    return InstructionCost::getInvalid();
4793
4794 // Vectorization for masked interleaved accesses is only enabled for scalable
4795 // VF.
4796  if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4797    return InstructionCost::getInvalid();
4798
4799 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4800 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4801 auto *SubVecTy =
4802 VectorType::get(VecVTy->getElementType(),
4803 VecVTy->getElementCount().divideCoefficientBy(Factor));
4804
4805 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4806 // Accesses having vector types that are a multiple of 128 bits can be
4807 // matched to more than one ldN/stN instruction.
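    // For example, an st2 of <32 x i8> (Factor == 2) uses SubVecTy <16 x i8>,
    // a single legal 128-bit access, so the cost below is 2 * 1 = 2.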
4808 bool UseScalable;
4809 if (MinElts % Factor == 0 &&
4810 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4811 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4812 }
4813
4814 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4815 Alignment, AddressSpace, CostKind,
4816 UseMaskForCond, UseMaskForGaps);
4817}
4818
4819InstructionCost
4820AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
4821  InstructionCost Cost = 0;
4822  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4823  for (auto *I : Tys) {
4824 if (!I->isVectorTy())
4825 continue;
4826 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4827 128)
4828 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4829 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4830 }
4831 return Cost;
4832}
4833
4834unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
4835  return ST->getMaxInterleaveFactor();
4836}
4837
4838// For Falkor, we want to avoid having too many strided loads in a loop since
4839// that can exhaust the HW prefetcher resources. We adjust the unroller
4840// MaxCount preference below to attempt to ensure unrolling doesn't create too
4841// many strided loads.
4842static void
4843getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4844                              TargetTransformInfo::UnrollingPreferences &UP) {
4845  enum { MaxStridedLoads = 7 };
4846 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4847 int StridedLoads = 0;
4848 // FIXME? We could make this more precise by looking at the CFG and
4849 // e.g. not counting loads in each side of an if-then-else diamond.
4850 for (const auto BB : L->blocks()) {
4851 for (auto &I : *BB) {
4852 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4853 if (!LMemI)
4854 continue;
4855
4856 Value *PtrValue = LMemI->getPointerOperand();
4857 if (L->isLoopInvariant(PtrValue))
4858 continue;
4859
4860 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4861 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4862 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4863 continue;
4864
4865 // FIXME? We could take pairing of unrolled load copies into account
4866 // by looking at the AddRec, but we would probably have to limit this
4867 // to loops with no stores or other memory optimization barriers.
4868 ++StridedLoads;
4869 // We've seen enough strided loads that seeing more won't make a
4870 // difference.
4871 if (StridedLoads > MaxStridedLoads / 2)
4872 return StridedLoads;
4873 }
4874 }
4875 return StridedLoads;
4876 };
4877
4878 int StridedLoads = countStridedLoads(L, SE);
4879 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4880 << " strided loads\n");
4881 // Pick the largest power of 2 unroll count that won't result in too many
4882 // strided loads.
4883 if (StridedLoads) {
4884 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
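    // For example, with 3 strided loads detected this gives
    // MaxCount = 1 << Log2_32(7 / 3) = 2.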
4885 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4886 << UP.MaxCount << '\n');
4887 }
4888}
4889
4890// This function returns true if the loop:
4891// 1. Has a valid cost, and
4892// 2. Has a cost within the supplied budget.
4893// Otherwise it returns false.
4894static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
4895                                   InstructionCost Budget,
4896 unsigned *FinalSize) {
4897 // Estimate the size of the loop.
4898 InstructionCost LoopCost = 0;
4899
4900 for (auto *BB : L->getBlocks()) {
4901 for (auto &I : *BB) {
4902 SmallVector<const Value *, 4> Operands(I.operand_values());
4903 InstructionCost Cost =
4904 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
4905 // This can happen with intrinsics that don't currently have a cost model
4906 // or for some operations that require SVE.
4907 if (!Cost.isValid())
4908 return false;
4909
4910 LoopCost += Cost;
4911 if (LoopCost > Budget)
4912 return false;
4913 }
4914 }
4915
4916 if (FinalSize)
4917 *FinalSize = LoopCost.getValue();
4918 return true;
4919}
4920
4921static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
4922                                      const AArch64TTIImpl &TTI) {
4923 // Only consider loops with unknown trip counts for which we can determine
4924 // a symbolic expression. Multi-exit loops with small known trip counts will
4925 // likely be unrolled anyway.
4926 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4927  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
4928    return false;
4929
4930 // It might not be worth unrolling loops with low max trip counts. Restrict
4931 // this to max trip counts > 32 for now.
4932 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
4933 if (MaxTC > 0 && MaxTC <= 32)
4934 return false;
4935
4936 // Make sure the loop size is <= 5.
4937 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
4938 return false;
4939
4940 // Small search loops with multiple exits can be highly beneficial to unroll.
4941 // We only care about loops with exactly two exiting blocks, although each
4942 // block could jump to the same exit block.
4943 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
4944 if (Blocks.size() != 2)
4945 return false;
4946
4947 if (any_of(Blocks, [](BasicBlock *BB) {
4948 return !isa<BranchInst>(BB->getTerminator());
4949 }))
4950 return false;
4951
4952 return true;
4953}
4954
4955/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4956/// OOO engine's wide instruction window and various predictors.
4957static void
4958getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4959                                 TargetTransformInfo::UnrollingPreferences &UP,
4960                                 const AArch64TTIImpl &TTI) {
4961  // Limit this to loops with structure that is highly likely to benefit from
4962  // runtime unrolling; that is, we exclude outer loops and loops with many
4963  // blocks (i.e. likely with complex control flow). Note that the heuristics
4964  // here may be overly conservative and we err on the side of avoiding runtime
4965  // unrolling rather than unrolling excessively; they are all subject to further refinement.
4966 if (!L->isInnermost() || L->getNumBlocks() > 8)
4967 return;
4968
4969 // Loops with multiple exits are handled by common code.
4970 if (!L->getExitBlock())
4971 return;
4972
4973 // Check if the loop contains any reductions that could be parallelized when
4974  // unrolling. If so, enable partial unrolling if the trip count is known to be
4975 // a multiple of 2.
4976 bool HasParellelizableReductions =
4977 L->getNumBlocks() == 1 &&
4978 any_of(L->getHeader()->phis(),
4979 [&SE, L](PHINode &Phi) {
4980 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
4981 }) &&
4982 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
4983 if (HasParellelizableReductions &&
4984 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
4985 UP.Partial = true;
4986 UP.MaxCount = 4;
4987 UP.AddAdditionalAccumulators = true;
4988 }
4989
4990 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4991  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4992      (SE.getSmallConstantMaxTripCount(L) > 0 &&
4993 SE.getSmallConstantMaxTripCount(L) <= 32))
4994 return;
4995
4996 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4997 return;
4998
4999  if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
5000    return;
5001
5002 // Limit to loops with trip counts that are cheap to expand.
5003 UP.SCEVExpansionBudget = 1;
5004
5005 if (HasParellelizableReductions) {
5006 UP.Runtime = true;
5008 UP.AddAdditionalAccumulators = true;
5009 }
5010
5011  // Try to unroll small, few-block loops with a low size budget if they have
5012  // load/store dependencies, to expose more parallel memory access streams,
5013  // or if they do little work inside a block (i.e. a load -> X -> store pattern).
5014 BasicBlock *Header = L->getHeader();
5015 BasicBlock *Latch = L->getLoopLatch();
5016 if (Header == Latch) {
5017 // Estimate the size of the loop.
5018 unsigned Size;
5019 unsigned Width = 10;
5020 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5021 return;
5022
5023 // Try to find an unroll count that maximizes the use of the instruction
5024 // window, i.e. trying to fetch as many instructions per cycle as possible.
5025 unsigned MaxInstsPerLine = 16;
5026 unsigned UC = 1;
5027 unsigned BestUC = 1;
5028 unsigned SizeWithBestUC = BestUC * Size;
5029 while (UC <= 8) {
5030 unsigned SizeWithUC = UC * Size;
5031 if (SizeWithUC > 48)
5032 break;
5033 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5034 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5035 BestUC = UC;
5036 SizeWithBestUC = BestUC * Size;
5037 }
5038 UC++;
5039 }
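    // For example, with Size == 10 the search keeps UC * Size <= 48 and
    // prefers counts that best fill 16-instruction fetch groups, settling on
    // BestUC == 3 (30 instructions).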
5040
5041 if (BestUC == 1)
5042 return;
5043
5044 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5045    SmallVector<StoreInst *> Stores;
5046    for (auto *BB : L->blocks()) {
5047 for (auto &I : *BB) {
5048        Value *Ptr = getLoadStorePointerOperand(&I);
5049        if (!Ptr)
5050 continue;
5051 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5052 if (SE.isLoopInvariant(PtrSCEV, L))
5053 continue;
5054 if (isa<LoadInst>(&I)) {
5055 LoadedValuesPlus.insert(&I);
5056 // Include in-loop 1st users of loaded values.
5057 for (auto *U : I.users())
5058 if (L->contains(cast<Instruction>(U)))
5059 LoadedValuesPlus.insert(U);
5060 } else
5061 Stores.push_back(cast<StoreInst>(&I));
5062 }
5063 }
5064
5065 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5066 return LoadedValuesPlus.contains(SI->getOperand(0));
5067 }))
5068 return;
5069
5070 UP.Runtime = true;
5071 UP.DefaultUnrollRuntimeCount = BestUC;
5072 return;
5073 }
5074
5075 // Try to runtime-unroll loops with early-continues depending on loop-varying
5076 // loads; this helps with branch-prediction for the early-continues.
5077 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5078  SmallVector<BasicBlock *> Preds(predecessors(Latch));
5079  if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5080 !llvm::is_contained(Preds, Header) ||
5081 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5082 return;
5083
5084 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5085 [&](Instruction *I, unsigned Depth) -> bool {
5086 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5087 return false;
5088
5089 if (isa<LoadInst>(I))
5090 return true;
5091
5092 return any_of(I->operands(), [&](Value *V) {
5093 auto *I = dyn_cast<Instruction>(V);
5094 return I && DependsOnLoopLoad(I, Depth + 1);
5095 });
5096 };
5097 CmpPredicate Pred;
5098 Instruction *I;
5099 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5100 m_Value())) &&
5101 DependsOnLoopLoad(I, 0)) {
5102 UP.Runtime = true;
5103 }
5104}
5105
5106void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5107                                             TTI::UnrollingPreferences &UP,
5108                                             OptimizationRemarkEmitter *ORE) const {
5109 // Enable partial unrolling and runtime unrolling.
5110 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5111
5112 UP.UpperBound = true;
5113
5114  // Inner loops are more likely to be hot, and their runtime checks can be
5115  // hoisted out by the LICM pass, making the overhead lower, so try a larger
5116  // threshold to unroll more loops.
5117 if (L->getLoopDepth() > 1)
5118 UP.PartialThreshold *= 2;
5119
5120 // Disable partial & runtime unrolling on -Os.
5121  UP.PartialOptSizeThreshold = 0;
5122
5123 // Scan the loop: don't unroll loops with calls as this could prevent
5124 // inlining. Don't unroll auto-vectorized loops either, though do allow
5125 // unrolling of the scalar remainder.
5126 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5127 for (auto *BB : L->getBlocks()) {
5128 for (auto &I : *BB) {
5129 // Both auto-vectorized loops and the scalar remainder have the
5130 // isvectorized attribute, so differentiate between them by the presence
5131 // of vector instructions.
5132 if (IsVectorized && I.getType()->isVectorTy())
5133 return;
5134 if (isa<CallBase>(I)) {
5136        if (const Function *F = cast<CallBase>(I).getCalledFunction())
5137          if (!isLoweredToCall(F))
5138 continue;
5139 return;
5140 }
5141 }
5142 }
5143
5144 // Apply subtarget-specific unrolling preferences.
5145 switch (ST->getProcFamily()) {
5146 case AArch64Subtarget::AppleA14:
5147 case AArch64Subtarget::AppleA15:
5148 case AArch64Subtarget::AppleA16:
5149 case AArch64Subtarget::AppleM4:
5150 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5151 break;
5152 case AArch64Subtarget::Falkor:
5153    if (EnableFalkorHWPFUnrollFix)
5154      getFalkorUnrollingPreferences(L, SE, UP);
5155    break;
5156 default:
5157 break;
5158 }
5159
5160 // If this is a small, multi-exit loop similar to something like std::find,
5161 // then there is typically a performance improvement achieved by unrolling.
5162 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5163 UP.RuntimeUnrollMultiExit = true;
5164 UP.Runtime = true;
5165 // Limit unroll count.
5167 // Allow slightly more costly trip-count expansion to catch search loops
5168 // with pointer inductions.
5169 UP.SCEVExpansionBudget = 5;
5170 return;
5171 }
5172
5173 // Enable runtime unrolling for in-order models
5174 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
5175 // checking for that case, we can ensure that the default behaviour is
5176 // unchanged
5177 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5178 !ST->getSchedModel().isOutOfOrder()) {
5179 UP.Runtime = true;
5180 UP.Partial = true;
5181 UP.UnrollRemainder = true;
5182    UP.DefaultUnrollRuntimeCount = 4;
5183
5184    UP.UnrollAndJam = true;
5185    UP.UnrollAndJamInnerLoopThreshold = 60;
5186
5187}
5188
5193
5194Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5195                                                         Type *ExpectedType,
5196 bool CanCreate) const {
5197 switch (Inst->getIntrinsicID()) {
5198 default:
5199 return nullptr;
5200 case Intrinsic::aarch64_neon_st2:
5201 case Intrinsic::aarch64_neon_st3:
5202 case Intrinsic::aarch64_neon_st4: {
5203 // Create a struct type
5204 StructType *ST = dyn_cast<StructType>(ExpectedType);
5205 if (!CanCreate || !ST)
5206 return nullptr;
5207 unsigned NumElts = Inst->arg_size() - 1;
5208 if (ST->getNumElements() != NumElts)
5209 return nullptr;
5210 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5211 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5212 return nullptr;
5213 }
5214 Value *Res = PoisonValue::get(ExpectedType);
5215 IRBuilder<> Builder(Inst);
5216 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5217 Value *L = Inst->getArgOperand(i);
5218 Res = Builder.CreateInsertValue(Res, L, i);
5219 }
5220 return Res;
5221 }
5222 case Intrinsic::aarch64_neon_ld2:
5223 case Intrinsic::aarch64_neon_ld3:
5224 case Intrinsic::aarch64_neon_ld4:
5225 if (Inst->getType() == ExpectedType)
5226 return Inst;
5227 return nullptr;
5228 }
5229}
5230
5231bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5232                                        MemIntrinsicInfo &Info) const {
5233 switch (Inst->getIntrinsicID()) {
5234 default:
5235 break;
5236 case Intrinsic::aarch64_neon_ld2:
5237 case Intrinsic::aarch64_neon_ld3:
5238 case Intrinsic::aarch64_neon_ld4:
5239 Info.ReadMem = true;
5240 Info.WriteMem = false;
5241 Info.PtrVal = Inst->getArgOperand(0);
5242 break;
5243 case Intrinsic::aarch64_neon_st2:
5244 case Intrinsic::aarch64_neon_st3:
5245 case Intrinsic::aarch64_neon_st4:
5246 Info.ReadMem = false;
5247 Info.WriteMem = true;
5248 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5249 break;
5250 }
5251
5252 switch (Inst->getIntrinsicID()) {
5253 default:
5254 return false;
5255 case Intrinsic::aarch64_neon_ld2:
5256 case Intrinsic::aarch64_neon_st2:
5257 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5258 break;
5259 case Intrinsic::aarch64_neon_ld3:
5260 case Intrinsic::aarch64_neon_st3:
5261 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5262 break;
5263 case Intrinsic::aarch64_neon_ld4:
5264 case Intrinsic::aarch64_neon_st4:
5265 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5266 break;
5267 }
5268 return true;
5269}
5270
5271/// See if \p I should be considered for address type promotion. We check if \p
5272/// I is a sext with the right type and used in memory accesses. If it is used
5273/// in a "complex" getelementptr, we allow it to be promoted without finding
5274/// other sext instructions that sign extended the same initial value. A
5275/// getelementptr is considered as "complex" if it has more than 2 operands.
5276bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
5277    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5278 bool Considerable = false;
5279 AllowPromotionWithoutCommonHeader = false;
5280 if (!isa<SExtInst>(&I))
5281 return false;
5282 Type *ConsideredSExtType =
5283 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5284 if (I.getType() != ConsideredSExtType)
5285 return false;
5286 // See if the sext is the one with the right type and used in at least one
5287 // GetElementPtrInst.
5288 for (const User *U : I.users()) {
5289 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5290 Considerable = true;
5291 // A getelementptr is considered as "complex" if it has more than 2
5292 // operands. We will promote a SExt used in such complex GEP as we
5293 // expect some computation to be merged if they are done on 64 bits.
5294 if (GEPInst->getNumOperands() > 2) {
5295 AllowPromotionWithoutCommonHeader = true;
5296 break;
5297 }
5298 }
5299 }
5300 return Considerable;
5301}
5302
5303bool AArch64TTIImpl::isLegalToVectorizeReduction(
5304    const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5305 if (!VF.isScalable())
5306 return true;
5307
5308 Type *Ty = RdxDesc.getRecurrenceType();
5309 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5310 return false;
5311
5312 switch (RdxDesc.getRecurrenceKind()) {
5313 case RecurKind::Sub:
5315 case RecurKind::Add:
5316 case RecurKind::FAdd:
5317 case RecurKind::And:
5318 case RecurKind::Or:
5319 case RecurKind::Xor:
5320 case RecurKind::SMin:
5321 case RecurKind::SMax:
5322 case RecurKind::UMin:
5323 case RecurKind::UMax:
5324 case RecurKind::FMin:
5325 case RecurKind::FMax:
5326 case RecurKind::FMulAdd:
5327 case RecurKind::AnyOf:
5328 return true;
5329 default:
5330 return false;
5331 }
5332}
5333
5334InstructionCost
5335AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5336                                       FastMathFlags FMF,
5337                                       TTI::TargetCostKind CostKind) const {
5338 // The code-generator is currently not able to handle scalable vectors
5339 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5340 // it. This change will be removed when code-generation for these types is
5341 // sufficiently reliable.
5342 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5343    if (VTy->getElementCount() == ElementCount::getScalable(1))
5344      return InstructionCost::getInvalid();
5345
5346 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5347
5348 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5349 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5350
5351 InstructionCost LegalizationCost = 0;
5352 if (LT.first > 1) {
5353 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5354 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5355 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5356 }
5357
5358 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5359}
5360
5361InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5362    unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5363 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5364 InstructionCost LegalizationCost = 0;
5365 if (LT.first > 1) {
5366 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5367 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5368 LegalizationCost *= LT.first - 1;
5369 }
5370
5371 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5372 assert(ISD && "Invalid opcode");
5373 // Add the final reduction cost for the legal horizontal reduction
5374 switch (ISD) {
5375 case ISD::ADD:
5376 case ISD::AND:
5377 case ISD::OR:
5378 case ISD::XOR:
5379 case ISD::FADD:
5380 return LegalizationCost + 2;
5381 default:
5382    return InstructionCost::getInvalid();
5383  }
5384}
5385
5386InstructionCost
5387AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5388                                           std::optional<FastMathFlags> FMF,
5389                                           TTI::TargetCostKind CostKind) const {
5390 // The code-generator is currently not able to handle scalable vectors
5391 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5392 // it. This change will be removed when code-generation for these types is
5393 // sufficiently reliable.
5394 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5395    if (VTy->getElementCount() == ElementCount::getScalable(1))
5396      return InstructionCost::getInvalid();
5397
5398  if (TTI::requiresOrderedReduction(FMF)) {
5399    if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5400 InstructionCost BaseCost =
5401 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5402 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5403 // end up vectorizing for more computationally intensive loops.
5404 return BaseCost + FixedVTy->getNumElements();
5405 }
5406
5407    if (Opcode != Instruction::FAdd)
5408      return InstructionCost::getInvalid();
5409
5410    auto *VTy = cast<ScalableVectorType>(ValTy);
5411    InstructionCost Cost =
5412        getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5413 Cost *= getMaxNumElements(VTy->getElementCount());
5414 return Cost;
5415 }
5416
5417 if (isa<ScalableVectorType>(ValTy))
5418 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5419
5420 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5421 MVT MTy = LT.second;
5422 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5423 assert(ISD && "Invalid opcode");
5424
5425 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5426 // instructions as twice a normal vector add, plus 1 for each legalization
5427 // step (LT.first). This is the only arithmetic vector reduction operation for
5428 // which we have an instruction.
5429 // OR, XOR and AND costs should match the codegen from:
5430 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5431 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5432 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5433 static const CostTblEntry CostTblNoPairwise[]{
5434 {ISD::ADD, MVT::v8i8, 2},
5435 {ISD::ADD, MVT::v16i8, 2},
5436 {ISD::ADD, MVT::v4i16, 2},
5437 {ISD::ADD, MVT::v8i16, 2},
5438 {ISD::ADD, MVT::v2i32, 2},
5439 {ISD::ADD, MVT::v4i32, 2},
5440 {ISD::ADD, MVT::v2i64, 2},
5441 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5442 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5443 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5444 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5445 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5446 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5447 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5448 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5449 {ISD::XOR, MVT::v16i8, 7},
5450 {ISD::XOR, MVT::v4i16, 4},
5451 {ISD::XOR, MVT::v8i16, 6},
5452 {ISD::XOR, MVT::v2i32, 3},
5453 {ISD::XOR, MVT::v4i32, 5},
5454 {ISD::XOR, MVT::v2i64, 3},
5455 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5456 {ISD::AND, MVT::v16i8, 7},
5457 {ISD::AND, MVT::v4i16, 4},
5458 {ISD::AND, MVT::v8i16, 6},
5459 {ISD::AND, MVT::v2i32, 3},
5460 {ISD::AND, MVT::v4i32, 5},
5461 {ISD::AND, MVT::v2i64, 3},
5462 };
5463 switch (ISD) {
5464 default:
5465 break;
5466 case ISD::FADD:
5467 if (Type *EltTy = ValTy->getScalarType();
5468 // FIXME: For half types without fullfp16 support, this could extend and
5469 // use a fp32 faddp reduction but current codegen unrolls.
5470 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5471 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5472 const unsigned NElts = MTy.getVectorNumElements();
5473 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5474 isPowerOf2_32(NElts))
5475 // Reduction corresponding to series of fadd instructions is lowered to
5476 // series of faddp instructions. faddp has latency/throughput that
5477 // matches fadd instruction and hence, every faddp instruction can be
5478 // considered to have a relative cost = 1 with
5479 // CostKind = TCK_RecipThroughput.
5480 // An faddp will pairwise add vector elements, so the size of input
5481 // vector reduces by half every time, requiring
5482 // #(faddp instructions) = log2_32(NElts).
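      // For example, a v8f16 fadd reduction with full fp16 support needs
      // Log2_32(8) == 3 faddp steps, so a legal type (LT.first == 1) costs 3.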
5483 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5484 }
5485 break;
5486 case ISD::ADD:
5487 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5488 return (LT.first - 1) + Entry->Cost;
5489 break;
5490 case ISD::XOR:
5491 case ISD::AND:
5492 case ISD::OR:
5493 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5494 if (!Entry)
5495 break;
5496 auto *ValVTy = cast<FixedVectorType>(ValTy);
5497 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5498 isPowerOf2_32(ValVTy->getNumElements())) {
5499 InstructionCost ExtraCost = 0;
5500 if (LT.first != 1) {
5501 // Type needs to be split, so there is an extra cost of LT.first - 1
5502 // arithmetic ops.
5503 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5504 MTy.getVectorNumElements());
5505 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5506 ExtraCost *= LT.first - 1;
5507 }
5508 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5509 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5510 return Cost + ExtraCost;
5511 }
5512 break;
5513 }
5514 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5515}
5516
5518 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5519 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5520 EVT VecVT = TLI->getValueType(DL, VecTy);
5521 EVT ResVT = TLI->getValueType(DL, ResTy);
5522
5523 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5524 VecVT.getSizeInBits() >= 64) {
5525 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5526
5527 // The legal cases are:
5528 // UADDLV 8/16/32->32
5529 // UADDLP 32->64
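    // For example, a zext from <16 x i8> feeding an i32 add reduction maps to
    // UADDLV, so with a single legalization step the cost below is
    // (1 - 1) * 2 + 2 = 2.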
5530 unsigned RevVTSize = ResVT.getSizeInBits();
5531 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5532 RevVTSize <= 32) ||
5533 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5534 RevVTSize <= 32) ||
5535 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5536 RevVTSize <= 64))
5537 return (LT.first - 1) * 2 + 2;
5538 }
5539
5540 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5541 CostKind);
5542}
5543
5544InstructionCost
5545AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5546                                       Type *ResTy, VectorType *VecTy,
5547                                       TTI::TargetCostKind CostKind) const {
5548  EVT VecVT = TLI->getValueType(DL, VecTy);
5549 EVT ResVT = TLI->getValueType(DL, ResTy);
5550
5551 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5552 RedOpcode == Instruction::Add) {
5553 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5554
5555 // The legal cases with dotprod are
5556 // UDOT 8->32
5557 // Which requires an additional uaddv to sum the i32 values.
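    // For example, a <16 x i8> multiply-accumulate into i32 lowers to
    // UDOT/SDOT plus a final UADDV, giving LT.first + 2 == 3 for a legal type.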
5558 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5559 ResVT == MVT::i32)
5560 return LT.first + 2;
5561 }
5562
5563 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5564 CostKind);
5565}
5566
5567InstructionCost
5568AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5569                              TTI::TargetCostKind CostKind) const {
5570  static const CostTblEntry ShuffleTbl[] = {
5571 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5572 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5573 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5574 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5575 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5576 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5577 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5578 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5579 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5580 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5581 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5582 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5583 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5584 };
5585
5586 // The code-generator is currently not able to handle scalable vectors
5587 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5588 // it. This change will be removed when code-generation for these types is
5589 // sufficiently reliable.
5590  if (Tp->getElementCount() == ElementCount::getScalable(1))
5591    return InstructionCost::getInvalid();
5592
5593 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5594 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5595 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5596 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5597 : LT.second;
5598 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5599 InstructionCost LegalizationCost = 0;
5600 if (Index < 0) {
5601 LegalizationCost =
5602        getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5603                           CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5604        getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5605                           CmpInst::BAD_ICMP_PREDICATE, CostKind);
5606  }
5607
5608 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
5609 // Cost performed on a promoted type.
5610 if (LT.second.getScalarType() == MVT::i1) {
5611 LegalizationCost +=
5612        getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5613                         TTI::CastContextHint::None, CostKind) +
5614        getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5615                         TTI::CastContextHint::None, CostKind);
5616  }
5617 const auto *Entry =
5618 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5619 assert(Entry && "Illegal Type for Splice");
5620 LegalizationCost += Entry->Cost;
5621 return LegalizationCost * LT.first;
5622}
5623
5624InstructionCost AArch64TTIImpl::getPartialReductionCost(
5625    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5626    ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5627    TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5628    TTI::TargetCostKind CostKind) const {
5629  InstructionCost Invalid = InstructionCost::getInvalid();
5630  InstructionCost Cost(TTI::TCC_Basic);
5631
5632  if (CostKind != TTI::TCK_RecipThroughput)
5633    return Invalid;
5634
5635 // Sub opcodes currently only occur in chained cases.
5636 // Independent partial reduction subtractions are still costed as an add
5637 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5638 OpAExtend == TTI::PR_None)
5639 return Invalid;
5640
5641 // We only support multiply binary operations for now, and for muls we
5642 // require the types being extended to be the same.
5643 // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but
5644 // only if the i8mm or sve/streaming features are available.
5645 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB ||
5646 OpBExtend == TTI::PR_None ||
5647 (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
5648 !ST->isSVEorStreamingSVEAvailable())))
5649 return Invalid;
5650 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5651 "Unexpected values for OpBExtend or InputTypeB");
5652
5653 EVT InputEVT = EVT::getEVT(InputTypeA);
5654 EVT AccumEVT = EVT::getEVT(AccumType);
5655
5656 unsigned VFMinValue = VF.getKnownMinValue();
5657
5658 if (VF.isScalable()) {
5659 if (!ST->isSVEorStreamingSVEAvailable())
5660 return Invalid;
5661
5662 // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
5663 // since we can't lower that type.
5664 unsigned Scale =
5665 AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
5666 if (VFMinValue == Scale)
5667 return Invalid;
5668 }
5669 if (VF.isFixed() &&
5670 (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64))
5671 return Invalid;
5672
5673 if (InputEVT == MVT::i8) {
5674 switch (VFMinValue) {
5675 default:
5676 return Invalid;
5677 case 8:
5678 if (AccumEVT == MVT::i32)
5679 Cost *= 2;
5680 else if (AccumEVT != MVT::i64)
5681 return Invalid;
5682 break;
5683 case 16:
5684 if (AccumEVT == MVT::i64)
5685 Cost *= 2;
5686 else if (AccumEVT != MVT::i32)
5687 return Invalid;
5688 break;
5689 }
5690 } else if (InputEVT == MVT::i16) {
5691 // FIXME: Allow i32 accumulator but increase cost, as we would extend
5692 // it to i64.
5693 if (VFMinValue != 8 || AccumEVT != MVT::i64)
5694 return Invalid;
5695 } else
5696 return Invalid;
5697
5698 return Cost;
5699}
5700
5701InstructionCost
5702AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
5703                               VectorType *SrcTy, ArrayRef<int> Mask,
5704                               TTI::TargetCostKind CostKind, int Index,
5705                               VectorType *SubTp, ArrayRef<const Value *> Args,
5706                               const Instruction *CxtI) const {
5707 assert((Mask.empty() || DstTy->isScalableTy() ||
5708 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5709 "Expected the Mask to match the return size if given");
5710 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5711 "Expected the same scalar types");
5712 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5713
5714 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5715 // into smaller vectors and sum the cost of each shuffle.
5716 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5717 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5718 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5719 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5720 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5721 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5722 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5723 // cost than just the load.
5724 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5725        (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) ||
5726         ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))
5727      return std::max<InstructionCost>(1, LT.first / 4);
5728
5729 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5730 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5731 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5732 // cost than just the store.
5733 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5734        (ShuffleVectorInst::isInterleaveMask(
5735             Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
5736         ShuffleVectorInst::isInterleaveMask(
5737             Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
5738 return LT.first;
5739
5740 unsigned TpNumElts = Mask.size();
5741 unsigned LTNumElts = LT.second.getVectorNumElements();
5742 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5743 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
5744 LT.second.getVectorElementCount());
5745    InstructionCost Cost;
5746    std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5747 PreviousCosts;
5748 for (unsigned N = 0; N < NumVecs; N++) {
5749 SmallVector<int> NMask;
5750 // Split the existing mask into chunks of size LTNumElts. Track the source
5751 // sub-vectors to ensure the result has at most 2 inputs.
5752 unsigned Source1 = -1U, Source2 = -1U;
5753 unsigned NumSources = 0;
5754 for (unsigned E = 0; E < LTNumElts; E++) {
5755        int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5756                                                      : PoisonMaskElem;
5757        if (MaskElt < 0) {
5758          NMask.push_back(PoisonMaskElem);
5759          continue;
5760 }
5761
5762 // Calculate which source from the input this comes from and whether it
5763 // is new to us.
5764 unsigned Source = MaskElt / LTNumElts;
5765 if (NumSources == 0) {
5766 Source1 = Source;
5767 NumSources = 1;
5768 } else if (NumSources == 1 && Source != Source1) {
5769 Source2 = Source;
5770 NumSources = 2;
5771 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5772 NumSources++;
5773 }
5774
5775 // Add to the new mask. For the NumSources>2 case these are not correct,
5776 // but are only used for the modular lane number.
5777 if (Source == Source1)
5778 NMask.push_back(MaskElt % LTNumElts);
5779 else if (Source == Source2)
5780 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
5781 else
5782 NMask.push_back(MaskElt % LTNumElts);
5783 }
5784 // Check if we have already generated this sub-shuffle, which means we
5785 // will have already generated the output. For example a <16 x i32> splat
5786 // will be the same sub-splat 4 times, which only needs to be generated
5787 // once and reused.
5788 auto Result =
5789 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
5790 // Check if it was already in the map (already costed).
5791 if (!Result.second)
5792 continue;
5793 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
5794 // getShuffleCost. If not then cost it using the worst case as the number
5795 // of element moves into a new vector.
5796 InstructionCost NCost =
5797 NumSources <= 2
5798 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
5799                                                   : TTI::SK_PermuteTwoSrc,
5800                                 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
5801 CxtI)
5802 : LTNumElts;
5803 Result.first->second = NCost;
5804 Cost += NCost;
5805 }
5806 return Cost;
5807 }
5808
5809 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
5810 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
5811 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
5812 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
5813 // This currently only handles low or high extracts to prevent SLP vectorizer
5814 // regressions.
5815 // Note that SVE's ext instruction is destructive, but it can be fused with
5816 // a movprfx to act like a constructive instruction.
5817 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
5818 if (LT.second.getFixedSizeInBits() >= 128 &&
5819 cast<FixedVectorType>(SubTp)->getNumElements() ==
5820 LT.second.getVectorNumElements() / 2) {
5821 if (Index == 0)
5822 return 0;
5823 if (Index == (int)LT.second.getVectorNumElements() / 2)
5824 return 1;
5825 }
5826    Kind = TTI::SK_PermuteSingleSrc;
5827  }
5828 // FIXME: This was added to keep the costs equal when adding DstTys. Update
5829 // the code to handle length-changing shuffles.
5830 if (Kind == TTI::SK_InsertSubvector) {
5831 LT = getTypeLegalizationCost(DstTy);
5832 SrcTy = DstTy;
5833 }
5834
5835 // Segmented shuffle matching.
5836 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
5837 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
5838      SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
5839          AArch64::SVEBitsPerBlock)) {
5840
5841    FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
5842    unsigned Segments =
5843        SrcTy->getPrimitiveSizeInBits().getFixedValue() / AArch64::SVEBitsPerBlock;
5844    unsigned SegmentElts = VTy->getNumElements() / Segments;
5845
5846 // dupq zd.t, zn.t[idx]
5847 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
5848 ST->isSVEorStreamingSVEAvailable() &&
5849 isDUPQMask(Mask, Segments, SegmentElts))
5850 return LT.first;
5851
5852 // mov zd.q, vn
5853 if (ST->isSVEorStreamingSVEAvailable() &&
5854 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
5855 return LT.first;
5856 }
5857
5858 // Check for broadcast loads, which are supported by the LD1R instruction.
5859 // In terms of code-size, the shuffle vector is free when a load + dup get
5860 // folded into a LD1R. That's what we check and return here. For performance
5861 // and reciprocal throughput, a LD1R is not completely free. In this case, we
5862 // return the cost for the broadcast below (i.e. 1 for most/all types), so
5863 // that we model the load + dup sequence slightly higher because LD1R is a
5864 // high latency instruction.
5865 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
5866 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
5867 if (IsLoad && LT.second.isVector() &&
5868 isLegalBroadcastLoad(SrcTy->getElementType(),
5869 LT.second.getVectorElementCount()))
5870 return 0;
5871 }
5872
5873 // If we have 4 elements for the shuffle and a Mask, get the cost straight
5874 // from the perfect shuffle tables.
5875 if (Mask.size() == 4 &&
5876 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
5877 (SrcTy->getScalarSizeInBits() == 16 ||
5878 SrcTy->getScalarSizeInBits() == 32) &&
5879 all_of(Mask, [](int E) { return E < 8; }))
5880 return getPerfectShuffleCost(Mask);
5881
5882 // Check for identity masks, which we can treat as free.
5883 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
5884 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5885 all_of(enumerate(Mask), [](const auto &M) {
5886 return M.value() < 0 || M.value() == (int)M.index();
5887 }))
5888 return 0;
5889
5890 // Check for other shuffles that are not SK_ kinds but we have native
5891 // instructions for, for example ZIP and UZP.
5892 unsigned Unused;
5893 if (LT.second.isFixedLengthVector() &&
5894 LT.second.getVectorNumElements() == Mask.size() &&
5895 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5896 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
5897 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
5898 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5899 LT.second.getVectorNumElements(), 16) ||
5900 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5901 LT.second.getVectorNumElements(), 32) ||
5902 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5903 LT.second.getVectorNumElements(), 64) ||
5904 // Check for non-zero lane splats
5905 all_of(drop_begin(Mask),
5906 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
5907 return 1;
5908
5909 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
5910 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
5911 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
5912 static const CostTblEntry ShuffleTbl[] = {
5913 // Broadcast shuffle kinds can be performed with 'dup'.
5914 {TTI::SK_Broadcast, MVT::v8i8, 1},
5915 {TTI::SK_Broadcast, MVT::v16i8, 1},
5916 {TTI::SK_Broadcast, MVT::v4i16, 1},
5917 {TTI::SK_Broadcast, MVT::v8i16, 1},
5918 {TTI::SK_Broadcast, MVT::v2i32, 1},
5919 {TTI::SK_Broadcast, MVT::v4i32, 1},
5920 {TTI::SK_Broadcast, MVT::v2i64, 1},
5921 {TTI::SK_Broadcast, MVT::v4f16, 1},
5922 {TTI::SK_Broadcast, MVT::v8f16, 1},
5923 {TTI::SK_Broadcast, MVT::v4bf16, 1},
5924 {TTI::SK_Broadcast, MVT::v8bf16, 1},
5925 {TTI::SK_Broadcast, MVT::v2f32, 1},
5926 {TTI::SK_Broadcast, MVT::v4f32, 1},
5927 {TTI::SK_Broadcast, MVT::v2f64, 1},
5928 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
5929 // 'zip1/zip2' instructions.
5930 {TTI::SK_Transpose, MVT::v8i8, 1},
5931 {TTI::SK_Transpose, MVT::v16i8, 1},
5932 {TTI::SK_Transpose, MVT::v4i16, 1},
5933 {TTI::SK_Transpose, MVT::v8i16, 1},
5934 {TTI::SK_Transpose, MVT::v2i32, 1},
5935 {TTI::SK_Transpose, MVT::v4i32, 1},
5936 {TTI::SK_Transpose, MVT::v2i64, 1},
5937 {TTI::SK_Transpose, MVT::v4f16, 1},
5938 {TTI::SK_Transpose, MVT::v8f16, 1},
5939 {TTI::SK_Transpose, MVT::v4bf16, 1},
5940 {TTI::SK_Transpose, MVT::v8bf16, 1},
5941 {TTI::SK_Transpose, MVT::v2f32, 1},
5942 {TTI::SK_Transpose, MVT::v4f32, 1},
5943 {TTI::SK_Transpose, MVT::v2f64, 1},
5944 // Select shuffle kinds.
5945 // TODO: handle vXi8/vXi16.
5946 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
5947 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
5948 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
5949 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
5950 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
5951 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
5952 // PermuteSingleSrc shuffle kinds.
5953 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
5954 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
5955 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
5956 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
5957 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
5958 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
5959 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
5960 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
5961 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
5962 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
5963 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
5964 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
5965 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
5966 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
5967 // Reverse can be lowered with `rev`.
5968 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
5969 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
5970 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
5971 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
5972 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
5973 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
5974 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
5975 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
5976 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
5977 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
5978 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
5979 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
5980 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
5981 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
5982 // Splice can all be lowered as `ext`.
5983 {TTI::SK_Splice, MVT::v2i32, 1},
5984 {TTI::SK_Splice, MVT::v4i32, 1},
5985 {TTI::SK_Splice, MVT::v2i64, 1},
5986 {TTI::SK_Splice, MVT::v2f32, 1},
5987 {TTI::SK_Splice, MVT::v4f32, 1},
5988 {TTI::SK_Splice, MVT::v2f64, 1},
5989 {TTI::SK_Splice, MVT::v8f16, 1},
5990 {TTI::SK_Splice, MVT::v8bf16, 1},
5991 {TTI::SK_Splice, MVT::v8i16, 1},
5992 {TTI::SK_Splice, MVT::v16i8, 1},
5993 {TTI::SK_Splice, MVT::v4f16, 1},
5994 {TTI::SK_Splice, MVT::v4bf16, 1},
5995 {TTI::SK_Splice, MVT::v4i16, 1},
5996 {TTI::SK_Splice, MVT::v8i8, 1},
5997 // Broadcast shuffle kinds for scalable vectors
5998 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
5999 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6000 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6001 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6002 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6003 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6004 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6005 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6006 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6007 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6008 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6009 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6010 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6011 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6012 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6013 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6014 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6015 // Handle the cases for vector.reverse with scalable vectors
6016 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6017 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6018 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6019 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6020 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6021 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6022 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6023 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6024 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6025 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6026 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6027 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6028 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6029 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6030 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6031 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6032 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6033 };
6034 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6035 return LT.first * Entry->Cost;
6036 }
6037
6038 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6039 return getSpliceCost(SrcTy, Index, CostKind);
6040
6041 // Inserting a subvector can often be done with either a D, S or H register
6042 // move, so long as the inserted vector is "aligned".
6043 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6044 LT.second.getSizeInBits() <= 128 && SubTp) {
6045 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6046 if (SubLT.second.isVector()) {
6047 int NumElts = LT.second.getVectorNumElements();
6048 int NumSubElts = SubLT.second.getVectorNumElements();
6049 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6050 return SubLT.first;
6051 }
6052 }
6053
6054 // Restore optimal kind.
6055 if (IsExtractSubvector)
6056    Kind = TTI::SK_ExtractSubvector;
6057  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6058 Args, CxtI);
6059}
6060
6061static bool containsDecreasingPointers(Loop *TheLoop,
6062                                       PredicatedScalarEvolution *PSE) {
6063  const auto &Strides = DenseMap<Value *, const SCEV *>();
6064 for (BasicBlock *BB : TheLoop->blocks()) {
6065 // Scan the instructions in the block and look for addresses that are
6066 // consecutive and decreasing.
6067 for (Instruction &I : *BB) {
6068 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6069 Value *Ptr = getLoadStorePointerOperand(&I);
6070 Type *AccessTy = getLoadStoreType(&I);
6071 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
6072 /*ShouldCheckWrap=*/false)
6073 .value_or(0) < 0)
6074 return true;
6075 }
6076 }
6077 }
6078 return false;
6079}
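// Illustrative case (hypothetical IR, not taken from the source): a loop that
// walks an array backwards, e.g. "for (i = n; i > 0; --i) s += a[i - 1];",
// produces a pointer SCEV with a negative stride, so containsDecreasingPointers
// returns true and tail-folding would additionally need reversed predicates.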
6080
6081 bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6082 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6083 return SVEPreferFixedOverScalableIfEqualCost;
6084 // For cases like post-LTO vectorization, when we eventually know the trip
6085 // count, epilogue with fixed-width vectorization can be deleted if the trip
6086 // count is less than the epilogue iterations. That's why we prefer
6087 // fixed-width vectorization in epilogue in case of equal costs.
6088 if (IsEpilogue)
6089 return true;
6090 return ST->useFixedOverScalableIfEqualCost();
6091}
6092
6093 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6094 return ST->getEpilogueVectorizationMinVF();
6095}
6096
6097 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
6098 if (!ST->hasSVE())
6099 return false;
6100
6101 // We don't currently support vectorisation with interleaving for SVE - with
6102 // such loops we're better off not using tail-folding. This gives us a chance
6103 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6104 if (TFI->IAI->hasGroups())
6105 return false;
6106
6107 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6108 if (TFI->LVL->getReductionVars().size())
6109 Required |= TailFoldingOpts::Reductions;
6110 if (TFI->LVL->getFixedOrderRecurrences().size())
6111 Required |= TailFoldingOpts::Recurrences;
6112
6113 // We call this to discover whether any load/store pointers in the loop have
6114 // negative strides. This will require extra work to reverse the loop
6115 // predicate, which may be expensive.
6116 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6117 TFI->LVL->getPredicatedScalarEvolution()))
6118 Required |= TailFoldingOpts::Reverse;
6119 if (Required == TailFoldingOpts::Disabled)
6120 Required |= TailFoldingOpts::Simple;
6121
6122 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6123 Required))
6124 return false;
6125
6126 // Don't tail-fold for tight loops where we would be better off interleaving
6127 // with an unpredicated loop.
6128 unsigned NumInsns = 0;
6129 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6130 NumInsns += BB->sizeWithoutDebug();
6131 }
6132
6133 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6134 return NumInsns >= SVETailFoldInsnThreshold;
6135}
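// Usage sketch (assuming the -sve-tail-folding option defined earlier in this
// file): the decision above can be overridden on the command line, e.g.
// "-mllvm -sve-tail-folding=all" forces tail-folding for every legal loop
// kind, while "-mllvm -sve-tail-folding=default+noreverse" keeps the CPU
// defaults but opts out for loops that would need reversed predicates.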
6136
6137 InstructionCost
6138 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6139 StackOffset BaseOffset, bool HasBaseReg,
6140 int64_t Scale, unsigned AddrSpace) const {
6141 // Scaling factors are not free at all.
6142 // Operands | Rt Latency
6143 // -------------------------------------------
6144 // Rt, [Xn, Xm] | 4
6145 // -------------------------------------------
6146 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6147 // Rt, [Xn, Wm, <extend> #imm] |
6148 TargetLoweringBase::AddrMode AM;
6149 AM.BaseGV = BaseGV;
6150 AM.BaseOffs = BaseOffset.getFixed();
6151 AM.HasBaseReg = HasBaseReg;
6152 AM.Scale = Scale;
6153 AM.ScalableOffset = BaseOffset.getScalable();
6154 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6155 // Scale represents reg2 * scale, thus account for 1 if
6156 // it is not equal to 0 or 1.
6157 return AM.Scale != 0 && AM.Scale != 1;
6158 return InstructionCost::getInvalid();
6159}
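// Worked example (illustrative): "ldr x0, [x1, x2, lsl #3]" uses Scale == 8,
// which is a legal addressing mode for an i64 access, so the function above
// returns 1 (one unit of cost for the scaled register); a plain
// "ldr x0, [x1]" has Scale == 0 and returns 0.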
6160
6161 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6162 const Instruction *I) const {
6163 if (EnableOrLikeSelectOpt) {
6164 // For the binary operators (e.g. or) we need to be more careful than
6165 // selects, here we only transform them if they are already at a natural
6166 // break point in the code - the end of a block with an unconditional
6167 // terminator.
6168 if (I->getOpcode() == Instruction::Or &&
6169 isa<BranchInst>(I->getNextNode()) &&
6170 cast<BranchInst>(I->getNextNode())->isUnconditional())
6171 return true;
6172
6173 if (I->getOpcode() == Instruction::Add ||
6174 I->getOpcode() == Instruction::Sub)
6175 return true;
6176 }
6177 return BaseT::shouldTreatInstructionLikeSelect(I);
6178}
6179
6180 bool AArch64TTIImpl::isLSRCostLess(
6181 const TargetTransformInfo::LSRCost &C1,
6182 const TargetTransformInfo::LSRCost &C2) const {
6183 // AArch64 specific here is adding the number of instructions to the
6184 // comparison (though not as the first consideration, as some targets do)
6185 // along with changing the priority of the base additions.
6186 // TODO: Maybe a more nuanced tradeoff between instruction count
6187 // and number of registers? To be investigated at a later date.
6188 if (EnableLSRCostOpt)
6189 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6190 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6191 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6192 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6193
6194 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6195}
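// Worked example (illustrative numbers): with EnableLSRCostOpt set, a
// candidate using 3 registers and 10 instructions is preferred over one using
// 4 registers and 8 instructions, because NumRegs is compared before Insns in
// the std::tie ordering above.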
6196
6197static bool isSplatShuffle(Value *V) {
6198 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6199 return all_equal(Shuf->getShuffleMask());
6200 return false;
6201}
6202
6203/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6204/// or upper half of the vector elements.
6205static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6206 bool AllowSplat = false) {
6207 // Scalable types can't be extract shuffle vectors.
6208 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6209 return false;
6210
6211 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6212 auto *FullTy = FullV->getType();
6213 auto *HalfTy = HalfV->getType();
6214 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6215 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6216 };
6217
6218 auto extractHalf = [](Value *FullV, Value *HalfV) {
6219 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6220 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6221 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6222 };
6223
6224 ArrayRef<int> M1, M2;
6225 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6226 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6227 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6228 return false;
6229
6230 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6231 // it is not checked as an extract below.
6232 if (AllowSplat && isSplatShuffle(Op1))
6233 S1Op1 = nullptr;
6234 if (AllowSplat && isSplatShuffle(Op2))
6235 S2Op1 = nullptr;
6236
6237 // Check that the operands are half as wide as the result and we extract
6238 // half of the elements of the input vectors.
6239 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6240 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6241 return false;
6242
6243 // Check the mask extracts either the lower or upper half of vector
6244 // elements.
6245 int M1Start = 0;
6246 int M2Start = 0;
6247 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6248 if ((S1Op1 &&
6249 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6250 (S2Op1 &&
6251 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6252 return false;
6253
6254 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6255 (M2Start != 0 && M2Start != (NumElements / 2)))
6256 return false;
6257 if (S1Op1 && S2Op1 && M1Start != M2Start)
6258 return false;
6259
6260 return true;
6261}
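// Illustrative IR (hypothetical values, matching the checks above): both
// operands extract the same half of a wider vector, for example
//   %lo = shufflevector <16 x i8> %a, <16 x i8> poison,
//         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// together with the equivalent low-half extract of %b. Sinking such extracts
// next to a widening multiply lets the backend pick the paired smull/smull2
// style instructions instead of materializing the halves separately.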
6262
6263/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6264/// of the vector elements.
6265static bool areExtractExts(Value *Ext1, Value *Ext2) {
6266 auto areExtDoubled = [](Instruction *Ext) {
6267 return Ext->getType()->getScalarSizeInBits() ==
6268 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6269 };
6270
6271 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6272 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6273 !areExtDoubled(cast<Instruction>(Ext1)) ||
6274 !areExtDoubled(cast<Instruction>(Ext2)))
6275 return false;
6276
6277 return true;
6278}
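// Illustrative example (hypothetical types): "sext <8 x i8> to <8 x i16>" on
// both operands satisfies the doubled-width check above, so an add of the two
// extends can later be matched to saddl/saddl2 once the extends (and any
// half-extract shuffles feeding them) are sunk next to the add.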
6279
6280/// Check if Op could be used with vmull_high_p64 intrinsic.
6281 static bool isOperandOfVmullHighP64(Value *Op) {
6282 Value *VectorOperand = nullptr;
6283 ConstantInt *ElementIndex = nullptr;
6284 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6285 m_ConstantInt(ElementIndex))) &&
6286 ElementIndex->getValue() == 1 &&
6287 isa<FixedVectorType>(VectorOperand->getType()) &&
6288 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6289}
6290
6291/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6292static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6293 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6294}
6295
6296 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6297 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6298 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6299 if (!GEP || GEP->getNumOperands() != 2)
6300 return false;
6301
6302 Value *Base = GEP->getOperand(0);
6303 Value *Offsets = GEP->getOperand(1);
6304
6305 // We only care about scalar_base+vector_offsets.
6306 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6307 return false;
6308
6309 // Sink extends that would allow us to use 32-bit offset vectors.
6310 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6311 auto *OffsetsInst = cast<Instruction>(Offsets);
6312 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6313 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6314 Ops.push_back(&GEP->getOperandUse(1));
6315 }
6316
6317 // Sink the GEP.
6318 return true;
6319}
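// Illustrative case (hypothetical IR): a masked gather whose pointer operand
// is "getelementptr float, ptr %base, <vscale x 4 x i64> %wide", where %wide
// is a zext of <vscale x 4 x i32> offsets; sinking the zext next to the
// gather lets codegen use a 32-bit offset form of the SVE gather rather than
// first widening the offsets to 64 bits.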
6320
6321/// We want to sink following cases:
6322/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6323/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6324 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6325 if (match(Op, m_VScale()))
6326 return true;
6327 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6328 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6329 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6330 return true;
6331 }
6332 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6333 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6334 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6335 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6336 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6337 return true;
6338 }
6339 return false;
6340}
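// Illustrative IR (hypothetical): for
//   %vs  = call i64 @llvm.vscale.i64()
//   %off = shl i64 %vs, 4
//   %p   = getelementptr i8, ptr %base, i64 %off
// the shl (and the vscale it wraps) are sunk next to the gep, so isel can
// fold the vscale-based offset into the address computation (an addvl-style
// increment) instead of keeping vscale live in a separate register.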
6341
6342/// Check if sinking \p I's operands to I's basic block is profitable, because
6343/// the operands can be folded into a target instruction, e.g.
6344/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6345 bool AArch64TTIImpl::isProfitableToSinkOperands(
6346 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6347 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
6348 switch (II->getIntrinsicID()) {
6349 case Intrinsic::aarch64_neon_smull:
6350 case Intrinsic::aarch64_neon_umull:
6351 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6352 /*AllowSplat=*/true)) {
6353 Ops.push_back(&II->getOperandUse(0));
6354 Ops.push_back(&II->getOperandUse(1));
6355 return true;
6356 }
6357 [[fallthrough]];
6358
6359 case Intrinsic::fma:
6360 case Intrinsic::fmuladd:
6361 if (isa<VectorType>(I->getType()) &&
6362 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6363 !ST->hasFullFP16())
6364 return false;
6365 [[fallthrough]];
6366 case Intrinsic::aarch64_neon_sqdmull:
6367 case Intrinsic::aarch64_neon_sqdmulh:
6368 case Intrinsic::aarch64_neon_sqrdmulh:
6369 // Sink splats for index lane variants
6370 if (isSplatShuffle(II->getOperand(0)))
6371 Ops.push_back(&II->getOperandUse(0));
6372 if (isSplatShuffle(II->getOperand(1)))
6373 Ops.push_back(&II->getOperandUse(1));
6374 return !Ops.empty();
6375 case Intrinsic::aarch64_neon_fmlal:
6376 case Intrinsic::aarch64_neon_fmlal2:
6377 case Intrinsic::aarch64_neon_fmlsl:
6378 case Intrinsic::aarch64_neon_fmlsl2:
6379 // Sink splats for index lane variants
6380 if (isSplatShuffle(II->getOperand(1)))
6381 Ops.push_back(&II->getOperandUse(1));
6382 if (isSplatShuffle(II->getOperand(2)))
6383 Ops.push_back(&II->getOperandUse(2));
6384 return !Ops.empty();
6385 case Intrinsic::aarch64_sve_ptest_first:
6386 case Intrinsic::aarch64_sve_ptest_last:
6387 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6388 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6389 Ops.push_back(&II->getOperandUse(0));
6390 return !Ops.empty();
6391 case Intrinsic::aarch64_sme_write_horiz:
6392 case Intrinsic::aarch64_sme_write_vert:
6393 case Intrinsic::aarch64_sme_writeq_horiz:
6394 case Intrinsic::aarch64_sme_writeq_vert: {
6395 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6396 if (!Idx || Idx->getOpcode() != Instruction::Add)
6397 return false;
6398 Ops.push_back(&II->getOperandUse(1));
6399 return true;
6400 }
6401 case Intrinsic::aarch64_sme_read_horiz:
6402 case Intrinsic::aarch64_sme_read_vert:
6403 case Intrinsic::aarch64_sme_readq_horiz:
6404 case Intrinsic::aarch64_sme_readq_vert:
6405 case Intrinsic::aarch64_sme_ld1b_vert:
6406 case Intrinsic::aarch64_sme_ld1h_vert:
6407 case Intrinsic::aarch64_sme_ld1w_vert:
6408 case Intrinsic::aarch64_sme_ld1d_vert:
6409 case Intrinsic::aarch64_sme_ld1q_vert:
6410 case Intrinsic::aarch64_sme_st1b_vert:
6411 case Intrinsic::aarch64_sme_st1h_vert:
6412 case Intrinsic::aarch64_sme_st1w_vert:
6413 case Intrinsic::aarch64_sme_st1d_vert:
6414 case Intrinsic::aarch64_sme_st1q_vert:
6415 case Intrinsic::aarch64_sme_ld1b_horiz:
6416 case Intrinsic::aarch64_sme_ld1h_horiz:
6417 case Intrinsic::aarch64_sme_ld1w_horiz:
6418 case Intrinsic::aarch64_sme_ld1d_horiz:
6419 case Intrinsic::aarch64_sme_ld1q_horiz:
6420 case Intrinsic::aarch64_sme_st1b_horiz:
6421 case Intrinsic::aarch64_sme_st1h_horiz:
6422 case Intrinsic::aarch64_sme_st1w_horiz:
6423 case Intrinsic::aarch64_sme_st1d_horiz:
6424 case Intrinsic::aarch64_sme_st1q_horiz: {
6425 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6426 if (!Idx || Idx->getOpcode() != Instruction::Add)
6427 return false;
6428 Ops.push_back(&II->getOperandUse(3));
6429 return true;
6430 }
6431 case Intrinsic::aarch64_neon_pmull:
6432 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6433 return false;
6434 Ops.push_back(&II->getOperandUse(0));
6435 Ops.push_back(&II->getOperandUse(1));
6436 return true;
6437 case Intrinsic::aarch64_neon_pmull64:
6438 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6439 II->getArgOperand(1)))
6440 return false;
6441 Ops.push_back(&II->getArgOperandUse(0));
6442 Ops.push_back(&II->getArgOperandUse(1));
6443 return true;
6444 case Intrinsic::masked_gather:
6445 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6446 return false;
6447 Ops.push_back(&II->getArgOperandUse(0));
6448 return true;
6449 case Intrinsic::masked_scatter:
6450 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6451 return false;
6452 Ops.push_back(&II->getArgOperandUse(1));
6453 return true;
6454 default:
6455 return false;
6456 }
6457 }
6458
6459 auto ShouldSinkCondition = [](Value *Cond,
6460 SmallVectorImpl<Use *> &Ops) -> bool {
6461 if (!isa<IntrinsicInst>(Cond))
6462 return false;
6463 auto *II = cast<IntrinsicInst>(Cond);
6464 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6465 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6466 return false;
6467 if (isa<CmpInst>(II->getOperand(0)))
6468 Ops.push_back(&II->getOperandUse(0));
6469 return true;
6470 };
6471
6472 switch (I->getOpcode()) {
6473 case Instruction::GetElementPtr:
6474 case Instruction::Add:
6475 case Instruction::Sub:
6476 // Sink vscales closer to uses for better isel
6477 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6478 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6479 Ops.push_back(&I->getOperandUse(Op));
6480 return true;
6481 }
6482 }
6483 break;
6484 case Instruction::Select: {
6485 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6486 return false;
6487
6488 Ops.push_back(&I->getOperandUse(0));
6489 return true;
6490 }
6491 case Instruction::Br: {
6492 if (cast<BranchInst>(I)->isUnconditional())
6493 return false;
6494
6495 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6496 return false;
6497
6498 Ops.push_back(&I->getOperandUse(0));
6499 return true;
6500 }
6501 default:
6502 break;
6503 }
6504
6505 if (!I->getType()->isVectorTy())
6506 return false;
6507
6508 switch (I->getOpcode()) {
6509 case Instruction::Sub:
6510 case Instruction::Add: {
6511 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6512 return false;
6513
6514 // If the exts' operands extract either the lower or upper elements, we
6515 // can sink them too.
6516 auto Ext1 = cast<Instruction>(I->getOperand(0));
6517 auto Ext2 = cast<Instruction>(I->getOperand(1));
6518 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6519 Ops.push_back(&Ext1->getOperandUse(0));
6520 Ops.push_back(&Ext2->getOperandUse(0));
6521 }
6522
6523 Ops.push_back(&I->getOperandUse(0));
6524 Ops.push_back(&I->getOperandUse(1));
6525
6526 return true;
6527 }
6528 case Instruction::Or: {
6529 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6530 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6531 if (ST->hasNEON()) {
6532 Instruction *OtherAnd, *IA, *IB;
6533 Value *MaskValue;
6534 // MainAnd refers to And instruction that has 'Not' as one of its operands
6535 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6536 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6537 m_Instruction(IA)))))) {
6538 if (match(OtherAnd,
6539 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6540 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6541 ? cast<Instruction>(I->getOperand(1))
6542 : cast<Instruction>(I->getOperand(0));
6543
6544 // Both Ands should be in same basic block as Or
6545 if (I->getParent() != MainAnd->getParent() ||
6546 I->getParent() != OtherAnd->getParent())
6547 return false;
6548
6549 // Non-mask operands of both Ands should also be in same basic block
6550 if (I->getParent() != IA->getParent() ||
6551 I->getParent() != IB->getParent())
6552 return false;
6553
6554 Ops.push_back(
6555 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6556 Ops.push_back(&I->getOperandUse(0));
6557 Ops.push_back(&I->getOperandUse(1));
6558
6559 return true;
6560 }
6561 }
6562 }
6563
6564 return false;
6565 }
6566 case Instruction::Mul: {
6567 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6568 auto *Ty = cast<VectorType>(V->getType());
6569 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6570 if (Ty->isScalableTy())
6571 return false;
6572
6573 // Indexed variants of Mul exist for i16 and i32 element types only.
6574 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6575 };
6576
6577 int NumZExts = 0, NumSExts = 0;
6578 for (auto &Op : I->operands()) {
6579 // Make sure we are not already sinking this operand
6580 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6581 continue;
6582
6583 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6584 auto *Ext = cast<Instruction>(Op);
6585 auto *ExtOp = Ext->getOperand(0);
6586 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6587 Ops.push_back(&Ext->getOperandUse(0));
6588 Ops.push_back(&Op);
6589
6590 if (isa<SExtInst>(Ext))
6591 NumSExts++;
6592 else
6593 NumZExts++;
6594
6595 continue;
6596 }
6597
6598 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6599 if (!Shuffle)
6600 continue;
6601
6602 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6603 // operand and the s/zext can help create indexed s/umull. This is
6604 // especially useful to prevent i64 mul being scalarized.
6605 if (isSplatShuffle(Shuffle) &&
6606 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6607 Ops.push_back(&Shuffle->getOperandUse(0));
6608 Ops.push_back(&Op);
6609 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6610 NumSExts++;
6611 else
6612 NumZExts++;
6613 continue;
6614 }
6615
6616 Value *ShuffleOperand = Shuffle->getOperand(0);
6617 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6618 if (!Insert)
6619 continue;
6620
6621 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6622 if (!OperandInstr)
6623 continue;
6624
6625 ConstantInt *ElementConstant =
6626 dyn_cast<ConstantInt>(Insert->getOperand(2));
6627 // Check that the insertelement is inserting into element 0
6628 if (!ElementConstant || !ElementConstant->isZero())
6629 continue;
6630
6631 unsigned Opcode = OperandInstr->getOpcode();
6632 if (Opcode == Instruction::SExt)
6633 NumSExts++;
6634 else if (Opcode == Instruction::ZExt)
6635 NumZExts++;
6636 else {
6637 // If we find that the top bits are known 0, then we can sink and allow
6638 // the backend to generate a umull.
6639 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6640 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6641 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6642 continue;
6643 NumZExts++;
6644 }
6645
6646 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6647 // the And, just to hoist it again back to the load.
6648 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6649 Ops.push_back(&Insert->getOperandUse(1));
6650 Ops.push_back(&Shuffle->getOperandUse(0));
6651 Ops.push_back(&Op);
6652 }
6653
6654 // It is profitable to sink if we found two of the same type of extends.
6655 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6656 return true;
6657
6658 // Otherwise, see if we should sink splats for indexed variants.
6659 if (!ShouldSinkSplatForIndexedVariant(I))
6660 return false;
6661
6662 Ops.clear();
6663 if (isSplatShuffle(I->getOperand(0)))
6664 Ops.push_back(&I->getOperandUse(0));
6665 if (isSplatShuffle(I->getOperand(1)))
6666 Ops.push_back(&I->getOperandUse(1));
6667
6668 return !Ops.empty();
6669 }
6670 case Instruction::FMul: {
6671 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6672 if (I->getType()->isScalableTy())
6673 return false;
6674
6675 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6676 !ST->hasFullFP16())
6677 return false;
6678
6679 // Sink splats for index lane variants
6680 if (isSplatShuffle(I->getOperand(0)))
6681 Ops.push_back(&I->getOperandUse(0));
6682 if (isSplatShuffle(I->getOperand(1)))
6683 Ops.push_back(&I->getOperandUse(1));
6684 return !Ops.empty();
6685 }
6686 default:
6687 return false;
6688 }
6689 return false;
6690}
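// Note on the Mul case above (illustrative): sinking a matching pair of
// extends (NumSExts == 2 or NumZExts == 2) lets instruction selection form
// smull/umull directly in the user's block; for example a <2 x i64> multiply
// of two zero-extended <2 x i32> values would otherwise be scalarized, since
// NEON has no 64x64->64 vector multiply.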
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static std::optional< Instruction * > instCombineSMECntsElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts, const AArch64Subtarget *ST)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
Hexagon Common GEP
const HexagonInstrInfo * TII
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:119
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const override
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool enableScalableVectorization() const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
unsigned countLeadingOnes() const
Definition APInt.h:1624
void negate()
Negate this APInt in place.
Definition APInt.h:1468
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:681
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:708
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:682
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:683
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:686
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:689
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:685
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:687
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:706
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:694
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:688
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
bool isUnsigned() const
Definition InstrTypes.h:938
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:671
bool empty() const
Definition DenseMap.h:107
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:156
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2571
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1107
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2559
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1923
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2286
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2494
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1714
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2204
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1847
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2593
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1860
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2277
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:825
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
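For illustration, assuming an LLVMContext Ctx: build <vscale x 4 x i32> and its double-length counterpart <vscale x 8 x i32>.
auto *VTy = ScalableVectorType::get(Type::getInt32Ty(Ctx), /*MinNumElts=*/4);
auto *WideVTy = ScalableVectorType::getDoubleElementsVectorType(VTy);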
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e. pessimizes) the computed backedge-taken count.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
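Two illustrative mask queries (the masks are hypothetical examples, not taken from this file):
// <0, 2, 4, 6> reads every second lane: a de-interleave of factor 2 starting at index 0.
int DeintMask[] = {0, 2, 4, 6};
unsigned DeintIndex;
bool IsDeint = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
    DeintMask, /*Factor=*/2, DeintIndex);
// <4, 5, 6, 7> extracts the high half of an 8-element vector (Index becomes 4).
int ExtractMask[] = {4, 5, 6, 7};
int ExtractIndex;
bool IsExtract = ShuffleVectorInst::isExtractSubvectorMask(
    ExtractMask, /*NumSrcElts=*/8, ExtractIndex);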
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:31
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:40
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:710
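A small usage sketch: splitting an option string of the form "mode+flags" at its first separator.
auto [Mode, Rest] = StringRef("all+reductions").split('+');
// Mode == "all", Rest == "reductions" (flag name chosen purely for illustration)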
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
virtual const DataLayout & getDataLayout() const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
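A brief sketch contrasting fixed and scalable sizes (e.g. a 128-bit NEON register versus a vscale x 128-bit SVE register):
TypeSize NeonBits = TypeSize::getFixed(128);
TypeSize SveBits = TypeSize::getScalable(128);
bool IsScalable = SveBits.isScalable();        // true
uint64_t MinBits = SveBits.getKnownMinValue(); // 128, i.e. 128 * vscale bits in total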
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
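A hedged sketch of how these immediate helpers might be queried; the AArch64_AM and AArch64_IMM namespaces come from their headers, and the constants are arbitrary examples.
// 0xff is a contiguous run of ones, so it is encodable as a 64-bit logical immediate.
bool Encodable = AArch64_AM::isLogicalImmediate(0xffULL, /*regSize=*/64);
// Count how many move-immediate instructions a 64-bit constant expands to.
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(0x1234000056780000ULL, /*BitSize=*/64, Insn);
unsigned NumMovInsns = Insn.size();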
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
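A minimal sketch of the pattern-match API, assuming an Instruction &I under inspection:
using namespace llvm::PatternMatch;
Value *X, *Y;
// Match an add of two zero-extends, with the operands in either order.
if (match(&I, m_c_Add(m_ZExt(m_Value(X)), m_ZExt(m_Value(Y)))))
  ; // both addends are zero-extended values, captured in X and Y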
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:330
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
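A sketch with a hypothetical table; the entry and its cost are illustrative, not real AArch64 numbers.
static const CostTblEntry ExampleCostTbl[] = {
    {ISD::ADD, MVT::v4i32, 1},
};
if (const auto *Entry = CostTableLookup(ExampleCostTbl, ISD::ADD, MVT::v4i32)) {
  unsigned Cost = Entry->Cost; // table hit: use the listed cost
  (void)Cost;
}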
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Uninitialized
Definition Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:336
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
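An illustrative query, assuming a PredicatedScalarEvolution PSE, an access type AccessTy, a pointer Ptr and the enclosing Loop *Lp are in scope:
std::optional<int64_t> Stride = getPtrStride(PSE, AccessTy, Ptr, Lp);
if (Stride && *Stride == 1)
  ; // consecutive accesses, one AccessTy element apart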
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
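A small sketch checking a zip mask, using the 8-element example from the description above:
int ZipMask[] = {0, 8, 1, 9, 2, 10, 3, 11};
unsigned WhichResult;
bool IsZip = isZIPMask(ZipMask, /*NumElts=*/8, WhichResult);
// WhichResult distinguishes the zip1 form from the zip2 form.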
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2110
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
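A sketch analogous to CostTableLookup, with a hypothetical conversion entry:
static const TypeConversionCostTblEntry ExampleConvTbl[] = {
    {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
};
if (const auto *Entry = ConvertCostTableLookup(ExampleConvTbl, ISD::ZERO_EXTEND,
                                               MVT::v4i32, MVT::v4i16)) {
  unsigned Cost = Entry->Cost; // table hit: cost of widening v4i16 to v4i32
  (void)Cost;
}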
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:378
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:279
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:380
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:311
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:318
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:331
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...