LLVM 22.0.0git
AArch64TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
29#include <algorithm>
30#include <optional>
31using namespace llvm;
32using namespace llvm::PatternMatch;
33
34#define DEBUG_TYPE "aarch64tti"
35
36static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
37 cl::init(true), cl::Hidden);
38
40 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
41
42static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
44
45static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
46 cl::init(10), cl::Hidden);
47
48static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
49 cl::init(15), cl::Hidden);
50
52 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
54
56 "call-penalty-sm-change", cl::init(5), cl::Hidden,
58 "Penalty of calling a function that requires a change to PSTATE.SM"));
59
61 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
62 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
63
64static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
65 cl::init(true), cl::Hidden);
66
67static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
68 cl::init(true), cl::Hidden);
69
70// A complete guess as to a reasonable cost.
72 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
73 cl::desc("The cost of a histcnt instruction"));
74
76 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
77 cl::desc("The number of instructions to search for a redundant dmb"));
78
79namespace {
80class TailFoldingOption {
81 // These bitfields will only ever be set to something non-zero in operator=,
82 // when setting the -sve-tail-folding option. This option should always be of
83 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
84 // InitialBits is one of (disabled|all|simple). EnableBits represents
85 // additional flags we're enabling, and DisableBits for those flags we're
86 // disabling. The default flag is tracked in the variable NeedsDefault, since
87 // at the time of setting the option we may not know what the default value
88 // for the CPU is.
89 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
90 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
91 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
92
93 // This value needs to be initialised to true in case the user does not
94 // explicitly set the -sve-tail-folding option.
95 bool NeedsDefault = true;
96
97 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
98
99 void setNeedsDefault(bool V) { NeedsDefault = V; }
100
101 void setEnableBit(TailFoldingOpts Bit) {
102 EnableBits |= Bit;
103 DisableBits &= ~Bit;
104 }
105
106 void setDisableBit(TailFoldingOpts Bit) {
107 EnableBits &= ~Bit;
108 DisableBits |= Bit;
109 }
110
111 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
112 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
113
114 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
115 "Initial bits should only include one of "
116 "(disabled|all|simple|default)");
117 Bits = NeedsDefault ? DefaultBits : InitialBits;
118 Bits |= EnableBits;
119 Bits &= ~DisableBits;
120
121 return Bits;
122 }
123
124 void reportError(std::string Opt) {
125 errs() << "invalid argument '" << Opt
126 << "' to -sve-tail-folding=; the option should be of the form\n"
127 " (disabled|all|default|simple)[+(reductions|recurrences"
128 "|reverse|noreductions|norecurrences|noreverse)]\n";
129 report_fatal_error("Unrecognised tail-folding option");
130 }
131
132public:
133
134 void operator=(const std::string &Val) {
135 // If the user explicitly sets -sve-tail-folding= then treat as an error.
136 if (Val.empty()) {
137 reportError("");
138 return;
139 }
140
141 // Since the user is explicitly setting the option we don't automatically
142 // need the default unless they require it.
143 setNeedsDefault(false);
144
145 SmallVector<StringRef, 4> TailFoldTypes;
146 StringRef(Val).split(TailFoldTypes, '+', -1, false);
147
148 unsigned StartIdx = 1;
149 if (TailFoldTypes[0] == "disabled")
150 setInitialBits(TailFoldingOpts::Disabled);
151 else if (TailFoldTypes[0] == "all")
152 setInitialBits(TailFoldingOpts::All);
153 else if (TailFoldTypes[0] == "default")
154 setNeedsDefault(true);
155 else if (TailFoldTypes[0] == "simple")
156 setInitialBits(TailFoldingOpts::Simple);
157 else {
158 StartIdx = 0;
159 setInitialBits(TailFoldingOpts::Disabled);
160 }
161
162 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
163 if (TailFoldTypes[I] == "reductions")
164 setEnableBit(TailFoldingOpts::Reductions);
165 else if (TailFoldTypes[I] == "recurrences")
166 setEnableBit(TailFoldingOpts::Recurrences);
167 else if (TailFoldTypes[I] == "reverse")
168 setEnableBit(TailFoldingOpts::Reverse);
169 else if (TailFoldTypes[I] == "noreductions")
170 setDisableBit(TailFoldingOpts::Reductions);
171 else if (TailFoldTypes[I] == "norecurrences")
172 setDisableBit(TailFoldingOpts::Recurrences);
173 else if (TailFoldTypes[I] == "noreverse")
174 setDisableBit(TailFoldingOpts::Reverse);
175 else
176 reportError(Val);
177 }
178 }
179
180 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
181 return (getBits(DefaultBits) & Required) == Required;
182 }
183};
184} // namespace
185
// Parsed value of the -sve-tail-folding option (see TailFoldingOption above),
// queried via TailFoldingOption::satisfies().
// NOTE(review): presumably registered as the cl::location of the option
// declared below; the option's declaration head is not visible in this
// extraction — confirm against the full source.
186TailFoldingOption TailFoldingOptionLoc;
187
189 "sve-tail-folding",
190 cl::desc(
191 "Control the use of vectorisation using tail-folding for SVE where the"
192 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
193 "\ndisabled (Initial) No loop types will vectorize using "
194 "tail-folding"
195 "\ndefault (Initial) Uses the default tail-folding settings for "
196 "the target CPU"
197 "\nall (Initial) All legal loop types will vectorize using "
198 "tail-folding"
199 "\nsimple (Initial) Use tail-folding for simple loops (not "
200 "reductions or recurrences)"
201 "\nreductions Use tail-folding for loops containing reductions"
202 "\nnoreductions Inverse of above"
203 "\nrecurrences Use tail-folding for loops containing fixed order "
204 "recurrences"
205 "\nnorecurrences Inverse of above"
206 "\nreverse Use tail-folding for loops requiring reversed "
207 "predicates"
208 "\nnoreverse Inverse of above"),
210
211// Experimental option that will only be fully functional when the
212// code-generator is changed to use SVE instead of NEON for all fixed-width
213// operations.
215 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
216
217// Experimental option that will only be fully functional when the cost-model
218// and code-generator have been changed to avoid using scalable vector
219// instructions that are not legal in streaming SVE mode.
221 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
222
223static bool isSMEABIRoutineCall(const CallInst &CI,
224 const AArch64TargetLowering &TLI) {
225 const auto *F = CI.getCalledFunction();
226 return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine();
227}
228
229/// Returns true if the function has explicit operations that can only be
230/// lowered using incompatible instructions for the selected mode. This also
231/// returns true if the function F may use or modify ZA state.
233 const AArch64TargetLowering &TLI) {
234 for (const BasicBlock &BB : *F) {
235 for (const Instruction &I : BB) {
236 // Be conservative for now and assume that any call to inline asm or to
237 // intrinsics could result in non-streaming ops (e.g. calls to
238 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
239 // all native LLVM instructions can be lowered to compatible instructions.
240 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
241 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
242 isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
243 return true;
244 }
245 }
246 return false;
247}
248
250 StringRef AttributeStr =
251 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
252 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
254 FeatureStr.split(Features, ",");
255 return AArch64::getFMVPriority(Features);
256}
257
259 return F.hasFnAttribute("fmv-features");
260}
261
// Target features that encode restrictions rather than capabilities. Their
// bits are XOR-inverted in areInlineCompatible so that, e.g., a
// "+execute-only" callee can be inlined into a caller without
// "+execute-only", but not vice versa.
262const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
263 AArch64::FeatureExecuteOnly,
264};
265
267 const Function *Callee) const {
268 SMECallAttrs CallAttrs(*Caller, *Callee);
269
270 // Never inline a function explicitly marked as being streaming,
271 // into a non-streaming function. Assume it was marked as streaming
272 // for a reason.
273 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
275 return false;
276
277 // When inlining, we should consider the body of the function, not the
278 // interface.
279 if (CallAttrs.callee().hasStreamingBody()) {
280 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
281 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
282 }
283
284 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
285 return false;
286
287 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
288 CallAttrs.requiresPreservingZT0() ||
289 CallAttrs.requiresPreservingAllZAState()) {
290 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
291 return false;
292 }
293
294 const TargetMachine &TM = getTLI()->getTargetMachine();
295 const FeatureBitset &CallerBits =
296 TM.getSubtargetImpl(*Caller)->getFeatureBits();
297 const FeatureBitset &CalleeBits =
298 TM.getSubtargetImpl(*Callee)->getFeatureBits();
299 // Adjust the feature bitsets by inverting some of the bits. This is needed
300 // for target features that represent restrictions rather than capabilities,
301 // for example a "+execute-only" callee can be inlined into a caller without
302 // "+execute-only", but not vice versa.
303 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
304 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
305
306 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
307}
308
310 const Function *Caller, const Function *Callee,
311 const ArrayRef<Type *> &Types) const {
312 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
313 return false;
314
315 // We need to ensure that argument promotion does not attempt to promote
316 // pointers to fixed-length vector types larger than 128 bits like
317 // <8 x float> (and pointers to aggregate types which have such fixed-length
318 // vector type members) into the values of the pointees. Such vector types
319 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
320 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
321 // types can be safely treated as 128-bit NEON types and they cannot be
322 // distinguished in IR.
323 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
324 auto FVTy = dyn_cast<FixedVectorType>(Ty);
325 return FVTy &&
326 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
327 }))
328 return false;
329
330 return true;
331}
332
333unsigned
335 unsigned DefaultCallPenalty) const {
336 // This function calculates a penalty for executing Call in F.
337 //
338 // There are two ways this function can be called:
339 // (1) F:
340 // call from F -> G (the call here is Call)
341 //
342 // For (1), Call.getCaller() == F, so it will always return a high cost if
343 // a streaming-mode change is required (thus promoting the need to inline the
344 // function)
345 //
346 // (2) F:
347 // call from F -> G (the call here is not Call)
348 // G:
349 // call from G -> H (the call here is Call)
350 //
351 // For (2), if after inlining the body of G into F the call to H requires a
352 // streaming-mode change, and the call to G from F would also require a
353 // streaming-mode change, then there is benefit to do the streaming-mode
354 // change only once and avoid inlining of G into F.
355
356 SMEAttrs FAttrs(*F);
357 SMECallAttrs CallAttrs(Call, getTLI());
358
359 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
360 if (F == Call.getCaller()) // (1)
361 return CallPenaltyChangeSM * DefaultCallPenalty;
362 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
363 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
364 }
365
366 return DefaultCallPenalty;
367}
368
373 ST->isNeonAvailable());
374}
375
376/// Calculate the cost of materializing a 64-bit value. This helper
377/// method might only calculate a fraction of a larger immediate. Therefore it
378/// is valid to return a cost of ZERO.
380 // Check if the immediate can be encoded within an instruction.
381 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
382 return 0;
383
384 if (Val < 0)
385 Val = ~Val;
386
387 // Calculate how many moves we will need to materialize this constant.
389 AArch64_IMM::expandMOVImm(Val, 64, Insn);
390 return Insn.size();
391}
392
393/// Calculate the cost of materializing the given constant.
397 assert(Ty->isIntegerTy());
398
399 unsigned BitSize = Ty->getPrimitiveSizeInBits();
400 if (BitSize == 0)
401 return ~0U;
402
403 // Sign-extend all constants to a multiple of 64-bit.
404 APInt ImmVal = Imm;
405 if (BitSize & 0x3f)
406 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
407
408 // Split the constant into 64-bit chunks and calculate the cost for each
409 // chunk.
411 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
412 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
413 int64_t Val = Tmp.getSExtValue();
414 Cost += getIntImmCost(Val);
415 }
416 // We need at least one instruction to materialze the constant.
417 return std::max<InstructionCost>(1, Cost);
418}
419
421 const APInt &Imm, Type *Ty,
423 Instruction *Inst) const {
424 assert(Ty->isIntegerTy());
425
426 unsigned BitSize = Ty->getPrimitiveSizeInBits();
427 // There is no cost model for constants with a bit size of 0. Return TCC_Free
428 // here, so that constant hoisting will ignore this constant.
429 if (BitSize == 0)
430 return TTI::TCC_Free;
431
432 unsigned ImmIdx = ~0U;
433 switch (Opcode) {
434 default:
435 return TTI::TCC_Free;
436 case Instruction::GetElementPtr:
437 // Always hoist the base address of a GetElementPtr.
438 if (Idx == 0)
439 return 2 * TTI::TCC_Basic;
440 return TTI::TCC_Free;
441 case Instruction::Store:
442 ImmIdx = 0;
443 break;
444 case Instruction::Add:
445 case Instruction::Sub:
446 case Instruction::Mul:
447 case Instruction::UDiv:
448 case Instruction::SDiv:
449 case Instruction::URem:
450 case Instruction::SRem:
451 case Instruction::And:
452 case Instruction::Or:
453 case Instruction::Xor:
454 case Instruction::ICmp:
455 ImmIdx = 1;
456 break;
457 // Always return TCC_Free for the shift value of a shift instruction.
458 case Instruction::Shl:
459 case Instruction::LShr:
460 case Instruction::AShr:
461 if (Idx == 1)
462 return TTI::TCC_Free;
463 break;
464 case Instruction::Trunc:
465 case Instruction::ZExt:
466 case Instruction::SExt:
467 case Instruction::IntToPtr:
468 case Instruction::PtrToInt:
469 case Instruction::BitCast:
470 case Instruction::PHI:
471 case Instruction::Call:
472 case Instruction::Select:
473 case Instruction::Ret:
474 case Instruction::Load:
475 break;
476 }
477
478 if (Idx == ImmIdx) {
479 int NumConstants = (BitSize + 63) / 64;
481 return (Cost <= NumConstants * TTI::TCC_Basic)
482 ? static_cast<int>(TTI::TCC_Free)
483 : Cost;
484 }
486}
487
490 const APInt &Imm, Type *Ty,
492 assert(Ty->isIntegerTy());
493
494 unsigned BitSize = Ty->getPrimitiveSizeInBits();
495 // There is no cost model for constants with a bit size of 0. Return TCC_Free
496 // here, so that constant hoisting will ignore this constant.
497 if (BitSize == 0)
498 return TTI::TCC_Free;
499
500 // Most (all?) AArch64 intrinsics do not support folding immediates into the
501 // selected instruction, so we compute the materialization cost for the
502 // immediate directly.
503 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
505
506 switch (IID) {
507 default:
508 return TTI::TCC_Free;
509 case Intrinsic::sadd_with_overflow:
510 case Intrinsic::uadd_with_overflow:
511 case Intrinsic::ssub_with_overflow:
512 case Intrinsic::usub_with_overflow:
513 case Intrinsic::smul_with_overflow:
514 case Intrinsic::umul_with_overflow:
515 if (Idx == 1) {
516 int NumConstants = (BitSize + 63) / 64;
518 return (Cost <= NumConstants * TTI::TCC_Basic)
519 ? static_cast<int>(TTI::TCC_Free)
520 : Cost;
521 }
522 break;
523 case Intrinsic::experimental_stackmap:
524 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
525 return TTI::TCC_Free;
526 break;
527 case Intrinsic::experimental_patchpoint_void:
528 case Intrinsic::experimental_patchpoint:
529 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
530 return TTI::TCC_Free;
531 break;
532 case Intrinsic::experimental_gc_statepoint:
533 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
534 return TTI::TCC_Free;
535 break;
536 }
538}
539
541AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
542 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
543 if (TyWidth == 32 || TyWidth == 64)
545 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
546 return TTI::PSK_Software;
547}
548
549static bool isUnpackedVectorVT(EVT VecVT) {
550 return VecVT.isScalableVector() &&
552}
553
555 const IntrinsicCostAttributes &ICA) {
556 // We need to know at least the number of elements in the vector of buckets
557 // and the size of each element to update.
558 if (ICA.getArgTypes().size() < 2)
560
561 // Only interested in costing for the hardware instruction from SVE2.
562 if (!ST->hasSVE2())
564
565 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
566 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
567 unsigned TotalHistCnts = 1;
568
569 unsigned EltSize = EltTy->getScalarSizeInBits();
570 // Only allow (up to 64b) integers or pointers
571 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
573
574 // FIXME: We should be able to generate histcnt for fixed-length vectors
575 // using ptrue with a specific VL.
576 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
577 unsigned EC = VTy->getElementCount().getKnownMinValue();
578 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
580
581 // HistCnt only supports 32b and 64b element types
582 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
583
584 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
586
587 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
588 TotalHistCnts = EC / NaturalVectorWidth;
589
590 return InstructionCost(BaseHistCntCost * TotalHistCnts);
591 }
592
594}
595
599 // The code-generator is currently not able to handle scalable vectors
600 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
601 // it. This change will be removed when code-generation for these types is
602 // sufficiently reliable.
603 auto *RetTy = ICA.getReturnType();
604 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
605 if (VTy->getElementCount() == ElementCount::getScalable(1))
607
608 switch (ICA.getID()) {
609 case Intrinsic::experimental_vector_histogram_add: {
610 InstructionCost HistCost = getHistogramCost(ST, ICA);
611 // If the cost isn't valid, we may still be able to scalarize
612 if (HistCost.isValid())
613 return HistCost;
614 break;
615 }
616 case Intrinsic::umin:
617 case Intrinsic::umax:
618 case Intrinsic::smin:
619 case Intrinsic::smax: {
620 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
621 MVT::v8i16, MVT::v2i32, MVT::v4i32,
622 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
623 MVT::nxv2i64};
625 // v2i64 types get converted to cmp+bif hence the cost of 2
626 if (LT.second == MVT::v2i64)
627 return LT.first * 2;
628 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
629 return LT.first;
630 break;
631 }
632 case Intrinsic::sadd_sat:
633 case Intrinsic::ssub_sat:
634 case Intrinsic::uadd_sat:
635 case Intrinsic::usub_sat: {
636 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
637 MVT::v8i16, MVT::v2i32, MVT::v4i32,
638 MVT::v2i64};
640 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
641 // need to extend the type, as it uses shr(qadd(shl, shl)).
642 unsigned Instrs =
643 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
644 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
645 return LT.first * Instrs;
646
648 uint64_t VectorSize = TS.getKnownMinValue();
649
650 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
651 return LT.first * Instrs;
652
653 break;
654 }
655 case Intrinsic::abs: {
656 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
657 MVT::v8i16, MVT::v2i32, MVT::v4i32,
658 MVT::v2i64};
660 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
661 return LT.first;
662 break;
663 }
664 case Intrinsic::bswap: {
665 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
666 MVT::v4i32, MVT::v2i64};
668 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
669 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
670 return LT.first;
671 break;
672 }
673 case Intrinsic::fma:
674 case Intrinsic::fmuladd: {
675 // Given a fma or fmuladd, cost it the same as a fmul instruction which are
676 // usually the same for costs. TODO: Add fp16 and bf16 expansion costs.
677 Type *EltTy = RetTy->getScalarType();
678 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
679 (EltTy->isHalfTy() && ST->hasFullFP16()))
680 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
681 break;
682 }
683 case Intrinsic::stepvector: {
684 InstructionCost Cost = 1; // Cost of the `index' instruction
686 // Legalisation of illegal vectors involves an `index' instruction plus
687 // (LT.first - 1) vector adds.
688 if (LT.first > 1) {
689 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
690 InstructionCost AddCost =
691 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
692 Cost += AddCost * (LT.first - 1);
693 }
694 return Cost;
695 }
696 case Intrinsic::vector_extract:
697 case Intrinsic::vector_insert: {
698 // If both the vector and subvector types are legal types and the index
699 // is 0, then this should be a no-op or simple operation; return a
700 // relatively low cost.
701
702 // If arguments aren't actually supplied, then we cannot determine the
703 // value of the index. We also want to skip predicate types.
704 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
706 break;
707
708 LLVMContext &C = RetTy->getContext();
709 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
710 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
711 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
712 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
713 // Skip this if either the vector or subvector types are unpacked
714 // SVE types; they may get lowered to stack stores and loads.
715 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
716 break;
717
719 getTLI()->getTypeConversion(C, SubVecVT);
721 getTLI()->getTypeConversion(C, VecVT);
722 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
723 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
724 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
725 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
726 return TTI::TCC_Free;
727 break;
728 }
729 case Intrinsic::bitreverse: {
730 static const CostTblEntry BitreverseTbl[] = {
731 {Intrinsic::bitreverse, MVT::i32, 1},
732 {Intrinsic::bitreverse, MVT::i64, 1},
733 {Intrinsic::bitreverse, MVT::v8i8, 1},
734 {Intrinsic::bitreverse, MVT::v16i8, 1},
735 {Intrinsic::bitreverse, MVT::v4i16, 2},
736 {Intrinsic::bitreverse, MVT::v8i16, 2},
737 {Intrinsic::bitreverse, MVT::v2i32, 2},
738 {Intrinsic::bitreverse, MVT::v4i32, 2},
739 {Intrinsic::bitreverse, MVT::v1i64, 2},
740 {Intrinsic::bitreverse, MVT::v2i64, 2},
741 };
742 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
743 const auto *Entry =
744 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
745 if (Entry) {
746 // Cost Model is using the legal type(i32) that i8 and i16 will be
747 // converted to +1 so that we match the actual lowering cost
748 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
749 TLI->getValueType(DL, RetTy, true) == MVT::i16)
750 return LegalisationCost.first * Entry->Cost + 1;
751
752 return LegalisationCost.first * Entry->Cost;
753 }
754 break;
755 }
756 case Intrinsic::ctpop: {
757 if (!ST->hasNEON()) {
758 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
759 return getTypeLegalizationCost(RetTy).first * 12;
760 }
761 static const CostTblEntry CtpopCostTbl[] = {
762 {ISD::CTPOP, MVT::v2i64, 4},
763 {ISD::CTPOP, MVT::v4i32, 3},
764 {ISD::CTPOP, MVT::v8i16, 2},
765 {ISD::CTPOP, MVT::v16i8, 1},
766 {ISD::CTPOP, MVT::i64, 4},
767 {ISD::CTPOP, MVT::v2i32, 3},
768 {ISD::CTPOP, MVT::v4i16, 2},
769 {ISD::CTPOP, MVT::v8i8, 1},
770 {ISD::CTPOP, MVT::i32, 5},
771 };
773 MVT MTy = LT.second;
774 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
775 // Extra cost of +1 when illegal vector types are legalized by promoting
776 // the integer type.
777 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
778 RetTy->getScalarSizeInBits()
779 ? 1
780 : 0;
781 return LT.first * Entry->Cost + ExtraCost;
782 }
783 break;
784 }
785 case Intrinsic::sadd_with_overflow:
786 case Intrinsic::uadd_with_overflow:
787 case Intrinsic::ssub_with_overflow:
788 case Intrinsic::usub_with_overflow:
789 case Intrinsic::smul_with_overflow:
790 case Intrinsic::umul_with_overflow: {
791 static const CostTblEntry WithOverflowCostTbl[] = {
792 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
793 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
794 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
795 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
796 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
797 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
798 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
799 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
800 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
801 {Intrinsic::usub_with_overflow, MVT::i8, 3},
802 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
803 {Intrinsic::usub_with_overflow, MVT::i16, 3},
804 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
805 {Intrinsic::usub_with_overflow, MVT::i32, 1},
806 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
807 {Intrinsic::usub_with_overflow, MVT::i64, 1},
808 {Intrinsic::smul_with_overflow, MVT::i8, 5},
809 {Intrinsic::umul_with_overflow, MVT::i8, 4},
810 {Intrinsic::smul_with_overflow, MVT::i16, 5},
811 {Intrinsic::umul_with_overflow, MVT::i16, 4},
812 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
813 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
814 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
815 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
816 };
817 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
818 if (MTy.isSimple())
819 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
820 MTy.getSimpleVT()))
821 return Entry->Cost;
822 break;
823 }
824 case Intrinsic::fptosi_sat:
825 case Intrinsic::fptoui_sat: {
826 if (ICA.getArgTypes().empty())
827 break;
828 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
829 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
830 EVT MTy = TLI->getValueType(DL, RetTy);
831 // Check for the legal types, which are where the size of the input and the
832 // output are the same, or we are using cvt f64->i32 or f32->i64.
833 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
834 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
835 LT.second == MVT::v2f64)) {
836 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
837 (LT.second == MVT::f64 && MTy == MVT::i32) ||
838 (LT.second == MVT::f32 && MTy == MVT::i64)))
839 return LT.first;
840 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
841 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
842 MTy.getScalarSizeInBits() == 64)
843 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
844 }
845 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
846 // f32.
847 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
848 return LT.first + getIntrinsicInstrCost(
849 {ICA.getID(),
850 RetTy,
851 {ICA.getArgTypes()[0]->getWithNewType(
852 Type::getFloatTy(RetTy->getContext()))}},
853 CostKind);
854 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
855 (LT.second == MVT::f16 && MTy == MVT::i64) ||
856 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
857 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
858 return LT.first;
859 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
860 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
861 MTy.getScalarSizeInBits() == 32)
862 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
863 // Extending vector types v8f16->v8i32. These current scalarize but the
864 // codegen could be better.
865 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
866 MTy.getScalarSizeInBits() == 64)
867 return MTy.getVectorNumElements() * 3;
868
869 // If we can we use a legal convert followed by a min+max
870 if ((LT.second.getScalarType() == MVT::f32 ||
871 LT.second.getScalarType() == MVT::f64 ||
872 LT.second.getScalarType() == MVT::f16) &&
873 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
874 Type *LegalTy =
875 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
876 if (LT.second.isVector())
877 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
879 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
880 LegalTy, {LegalTy, LegalTy});
882 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
883 LegalTy, {LegalTy, LegalTy});
885 return LT.first * Cost +
886 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
887 : 1);
888 }
889 // Otherwise we need to follow the default expansion that clamps the value
890 // using a float min/max with a fcmp+sel for nan handling when signed.
891 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
892 RetTy = RetTy->getScalarType();
893 if (LT.second.isVector()) {
894 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
895 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
896 }
897 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
899 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
901 Cost +=
902 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
904 if (IsSigned) {
905 Type *CondTy = RetTy->getWithNewBitWidth(1);
906 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
908 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
910 }
911 return LT.first * Cost;
912 }
913 case Intrinsic::fshl:
914 case Intrinsic::fshr: {
915 if (ICA.getArgs().empty())
916 break;
917
918 // TODO: Add handling for fshl where third argument is not a constant.
919 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
920 if (!OpInfoZ.isConstant())
921 break;
922
923 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
924 if (OpInfoZ.isUniform()) {
925 static const CostTblEntry FshlTbl[] = {
926 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
927 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
928 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
929 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
930 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
931 // to avoid having to duplicate the costs.
932 const auto *Entry =
933 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
934 if (Entry)
935 return LegalisationCost.first * Entry->Cost;
936 }
937
938 auto TyL = getTypeLegalizationCost(RetTy);
939 if (!RetTy->isIntegerTy())
940 break;
941
942 // Estimate cost manually, as types like i8 and i16 will get promoted to
943 // i32 and CostTableLookup will ignore the extra conversion cost.
944 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
945 RetTy->getScalarSizeInBits() < 64) ||
946 (RetTy->getScalarSizeInBits() % 64 != 0);
947 unsigned ExtraCost = HigherCost ? 1 : 0;
948 if (RetTy->getScalarSizeInBits() == 32 ||
949 RetTy->getScalarSizeInBits() == 64)
950 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
951 // extr instruction.
952 else if (HigherCost)
953 ExtraCost = 1;
954 else
955 break;
956 return TyL.first + ExtraCost;
957 }
958 case Intrinsic::get_active_lane_mask: {
959 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
960 if (RetTy) {
961 EVT RetVT = getTLI()->getValueType(DL, RetTy);
962 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
963 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
964 !getTLI()->isTypeLegal(RetVT)) {
965 // We don't have enough context at this point to determine if the mask
966 // is going to be kept live after the block, which will force the vXi1
967 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
968 // For now, we just assume the vectorizer created this intrinsic and
969 // the result will be the input for a PHI. In this case the cost will
970 // be extremely high for fixed-width vectors.
971 // NOTE: getScalarizationOverhead returns a cost that's far too
972 // pessimistic for the actual generated codegen. In reality there are
973 // two instructions generated per lane.
974 return RetTy->getNumElements() * 2;
975 }
976 }
977 break;
978 }
979 case Intrinsic::experimental_vector_match: {
980 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
981 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
982 unsigned SearchSize = NeedleTy->getNumElements();
983 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
984 // Base cost for MATCH instructions. At least on the Neoverse V2 and
985 // Neoverse V3, these are cheap operations with the same latency as a
986 // vector ADD. In most cases, however, we also need to do an extra DUP.
987 // For fixed-length vectors we currently need an extra five--six
988 // instructions besides the MATCH.
990 if (isa<FixedVectorType>(RetTy))
991 Cost += 10;
992 return Cost;
993 }
994 break;
995 }
996 case Intrinsic::experimental_cttz_elts: {
997 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
998 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
999 // This will consist of a SVE brkb and a cntp instruction. These
1000 // typically have the same latency and half the throughput as a vector
1001 // add instruction.
1002 return 4;
1003 }
1004 break;
1005 }
1006 default:
1007 break;
1008 }
1010}
1011
1012/// The function will remove redundant reinterprets casting in the presence
1013/// of the control flow
1014static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1015 IntrinsicInst &II) {
1017 auto RequiredType = II.getType();
1018
1019 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1020 assert(PN && "Expected Phi Node!");
1021
1022 // Don't create a new Phi unless we can remove the old one.
1023 if (!PN->hasOneUse())
1024 return std::nullopt;
1025
1026 for (Value *IncValPhi : PN->incoming_values()) {
1027 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1028 if (!Reinterpret ||
1029 Reinterpret->getIntrinsicID() !=
1030 Intrinsic::aarch64_sve_convert_to_svbool ||
1031 RequiredType != Reinterpret->getArgOperand(0)->getType())
1032 return std::nullopt;
1033 }
1034
1035 // Create the new Phi
1036 IC.Builder.SetInsertPoint(PN);
1037 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1038 Worklist.push_back(PN);
1039
1040 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1041 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1042 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1043 Worklist.push_back(Reinterpret);
1044 }
1045
1046 // Cleanup Phi Node and reinterprets
1047 return IC.replaceInstUsesWith(II, NPN);
1048}
1049
1050// A collection of properties common to SVE intrinsics that allow for combines
1051// to be written without needing to know the specific intrinsic.
1053 //
1054 // Helper routines for common intrinsic definitions.
1055 //
1056
1057 // e.g. llvm.aarch64.sve.add pg, op1, op2
1058 // with IID ==> llvm.aarch64.sve.add_u
1059 static SVEIntrinsicInfo
1061 return SVEIntrinsicInfo()
1065 }
1066
1067 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1069 return SVEIntrinsicInfo()
1073 }
1074
1075 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1077 return SVEIntrinsicInfo()
1080 }
1081
1082 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1084 return SVEIntrinsicInfo()
1087 }
1088
1089 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1090 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1091 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1092 return SVEIntrinsicInfo()
1095 }
1096
1097 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1098 // llvm.aarch64.sve.ld1 pg, ptr
1100 return SVEIntrinsicInfo()
1104 }
1105
1106 // All properties relate to predication and thus having a general predicate
1107 // is the minimum requirement to say there is intrinsic info to act on.
1108 explicit operator bool() const { return hasGoverningPredicate(); }
1109
1110 //
1111 // Properties relating to the governing predicate.
1112 //
1113
1115 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1116 }
1117
1119 assert(hasGoverningPredicate() && "Propery not set!");
1120 return GoverningPredicateIdx;
1121 }
1122
1124 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1125 GoverningPredicateIdx = Index;
1126 return *this;
1127 }
1128
1129 //
1130 // Properties relating to operations the intrinsic could be transformed into.
1131 // NOTE: This does not mean such a transformation is always possible, but the
1132 // knowledge makes it possible to reuse existing optimisations without needing
1133 // to embed specific handling for each intrinsic. For example, instruction
1134 // simplification can be used to optimise an intrinsic's active lanes.
1135 //
1136
1138 return UndefIntrinsic != Intrinsic::not_intrinsic;
1139 }
1140
1142 assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1143 return UndefIntrinsic;
1144 }
1145
1147 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1148 UndefIntrinsic = IID;
1149 return *this;
1150 }
1151
1152 bool hasMatchingIROpode() const { return IROpcode != 0; }
1153
1154 unsigned getMatchingIROpode() const {
1155 assert(hasMatchingIROpode() && "Propery not set!");
1156 return IROpcode;
1157 }
1158
1160 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1161 IROpcode = Opcode;
1162 return *this;
1163 }
1164
1165 //
1166 // Properties relating to the result of inactive lanes.
1167 //
1168
1170 return ResultLanes == InactiveLanesTakenFromOperand;
1171 }
1172
1174 assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1175 return OperandIdxForInactiveLanes;
1176 }
1177
1179 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1180 ResultLanes = InactiveLanesTakenFromOperand;
1181 OperandIdxForInactiveLanes = Index;
1182 return *this;
1183 }
1184
1186 return ResultLanes == InactiveLanesAreNotDefined;
1187 }
1188
1190 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1191 ResultLanes = InactiveLanesAreNotDefined;
1192 return *this;
1193 }
1194
1196 return ResultLanes == InactiveLanesAreUnused;
1197 }
1198
1200 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1201 ResultLanes = InactiveLanesAreUnused;
1202 return *this;
1203 }
1204
1205 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1206 // inactiveLanesAreZeroed =
1207 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1208 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1209
1211 ResultIsZeroInitialized = true;
1212 return *this;
1213 }
1214
1215 //
1216 // The first operand of unary merging operations is typically only used to
1217 // set the result for inactive lanes. Knowing this allows us to deadcode the
1218 // operand when we can prove there are no inactive lanes.
1219 //
1220
1222 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1223 }
1224
1226 assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1227 return OperandIdxWithNoActiveLanes;
1228 }
1229
1231 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1232 OperandIdxWithNoActiveLanes = Index;
1233 return *this;
1234 }
1235
1236private:
1237 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1238
1239 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1240 unsigned IROpcode = 0;
1241
1242 enum PredicationStyle {
1244 InactiveLanesTakenFromOperand,
1245 InactiveLanesAreNotDefined,
1246 InactiveLanesAreUnused
1247 } ResultLanes = Uninitialized;
1248
1249 bool ResultIsZeroInitialized = false;
1250 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1251 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1252};
1253
1255 // Some SVE intrinsics do not use scalable vector types, but since they are
1256 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1257 if (!isa<ScalableVectorType>(II.getType()) &&
1258 all_of(II.args(), [&](const Value *V) {
1259 return !isa<ScalableVectorType>(V->getType());
1260 }))
1261 return SVEIntrinsicInfo();
1262
1263 Intrinsic::ID IID = II.getIntrinsicID();
1264 switch (IID) {
1265 default:
1266 break;
1267 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1268 case Intrinsic::aarch64_sve_fcvt_f16f32:
1269 case Intrinsic::aarch64_sve_fcvt_f16f64:
1270 case Intrinsic::aarch64_sve_fcvt_f32f16:
1271 case Intrinsic::aarch64_sve_fcvt_f32f64:
1272 case Intrinsic::aarch64_sve_fcvt_f64f16:
1273 case Intrinsic::aarch64_sve_fcvt_f64f32:
1274 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1275 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1276 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1277 case Intrinsic::aarch64_sve_fcvtzs:
1278 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1279 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1280 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1281 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1282 case Intrinsic::aarch64_sve_fcvtzu:
1283 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1284 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1285 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1286 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1287 case Intrinsic::aarch64_sve_scvtf:
1288 case Intrinsic::aarch64_sve_scvtf_f16i32:
1289 case Intrinsic::aarch64_sve_scvtf_f16i64:
1290 case Intrinsic::aarch64_sve_scvtf_f32i64:
1291 case Intrinsic::aarch64_sve_scvtf_f64i32:
1292 case Intrinsic::aarch64_sve_ucvtf:
1293 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1294 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1295 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1296 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1298
1299 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1300 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1301 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1302 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1304
1305 case Intrinsic::aarch64_sve_fabd:
1306 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1307 case Intrinsic::aarch64_sve_fadd:
1308 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1309 .setMatchingIROpcode(Instruction::FAdd);
1310 case Intrinsic::aarch64_sve_fdiv:
1311 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1312 .setMatchingIROpcode(Instruction::FDiv);
1313 case Intrinsic::aarch64_sve_fmax:
1314 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1315 case Intrinsic::aarch64_sve_fmaxnm:
1316 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1317 case Intrinsic::aarch64_sve_fmin:
1318 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1319 case Intrinsic::aarch64_sve_fminnm:
1320 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1321 case Intrinsic::aarch64_sve_fmla:
1322 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1323 case Intrinsic::aarch64_sve_fmls:
1324 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1325 case Intrinsic::aarch64_sve_fmul:
1326 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1327 .setMatchingIROpcode(Instruction::FMul);
1328 case Intrinsic::aarch64_sve_fmulx:
1329 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1330 case Intrinsic::aarch64_sve_fnmla:
1331 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1332 case Intrinsic::aarch64_sve_fnmls:
1333 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1334 case Intrinsic::aarch64_sve_fsub:
1335 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1336 .setMatchingIROpcode(Instruction::FSub);
1337 case Intrinsic::aarch64_sve_add:
1338 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1339 .setMatchingIROpcode(Instruction::Add);
1340 case Intrinsic::aarch64_sve_mla:
1341 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1342 case Intrinsic::aarch64_sve_mls:
1343 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1344 case Intrinsic::aarch64_sve_mul:
1345 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1346 .setMatchingIROpcode(Instruction::Mul);
1347 case Intrinsic::aarch64_sve_sabd:
1348 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1349 case Intrinsic::aarch64_sve_sdiv:
1350 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1351 .setMatchingIROpcode(Instruction::SDiv);
1352 case Intrinsic::aarch64_sve_smax:
1353 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1354 case Intrinsic::aarch64_sve_smin:
1355 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1356 case Intrinsic::aarch64_sve_smulh:
1357 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1358 case Intrinsic::aarch64_sve_sub:
1359 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1360 .setMatchingIROpcode(Instruction::Sub);
1361 case Intrinsic::aarch64_sve_uabd:
1362 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1363 case Intrinsic::aarch64_sve_udiv:
1364 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1365 .setMatchingIROpcode(Instruction::UDiv);
1366 case Intrinsic::aarch64_sve_umax:
1367 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1368 case Intrinsic::aarch64_sve_umin:
1369 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1370 case Intrinsic::aarch64_sve_umulh:
1371 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1372 case Intrinsic::aarch64_sve_asr:
1373 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1374 .setMatchingIROpcode(Instruction::AShr);
1375 case Intrinsic::aarch64_sve_lsl:
1376 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1377 .setMatchingIROpcode(Instruction::Shl);
1378 case Intrinsic::aarch64_sve_lsr:
1379 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1380 .setMatchingIROpcode(Instruction::LShr);
1381 case Intrinsic::aarch64_sve_and:
1382 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1383 .setMatchingIROpcode(Instruction::And);
1384 case Intrinsic::aarch64_sve_bic:
1385 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1386 case Intrinsic::aarch64_sve_eor:
1387 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1388 .setMatchingIROpcode(Instruction::Xor);
1389 case Intrinsic::aarch64_sve_orr:
1390 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1391 .setMatchingIROpcode(Instruction::Or);
1392 case Intrinsic::aarch64_sve_sqsub:
1393 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1394 case Intrinsic::aarch64_sve_uqsub:
1395 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1396
1397 case Intrinsic::aarch64_sve_add_u:
1399 Instruction::Add);
1400 case Intrinsic::aarch64_sve_and_u:
1402 Instruction::And);
1403 case Intrinsic::aarch64_sve_asr_u:
1405 Instruction::AShr);
1406 case Intrinsic::aarch64_sve_eor_u:
1408 Instruction::Xor);
1409 case Intrinsic::aarch64_sve_fadd_u:
1411 Instruction::FAdd);
1412 case Intrinsic::aarch64_sve_fdiv_u:
1414 Instruction::FDiv);
1415 case Intrinsic::aarch64_sve_fmul_u:
1417 Instruction::FMul);
1418 case Intrinsic::aarch64_sve_fsub_u:
1420 Instruction::FSub);
1421 case Intrinsic::aarch64_sve_lsl_u:
1423 Instruction::Shl);
1424 case Intrinsic::aarch64_sve_lsr_u:
1426 Instruction::LShr);
1427 case Intrinsic::aarch64_sve_mul_u:
1429 Instruction::Mul);
1430 case Intrinsic::aarch64_sve_orr_u:
1432 Instruction::Or);
1433 case Intrinsic::aarch64_sve_sdiv_u:
1435 Instruction::SDiv);
1436 case Intrinsic::aarch64_sve_sub_u:
1438 Instruction::Sub);
1439 case Intrinsic::aarch64_sve_udiv_u:
1441 Instruction::UDiv);
1442
1443 case Intrinsic::aarch64_sve_addqv:
1444 case Intrinsic::aarch64_sve_and_z:
1445 case Intrinsic::aarch64_sve_bic_z:
1446 case Intrinsic::aarch64_sve_brka_z:
1447 case Intrinsic::aarch64_sve_brkb_z:
1448 case Intrinsic::aarch64_sve_brkn_z:
1449 case Intrinsic::aarch64_sve_brkpa_z:
1450 case Intrinsic::aarch64_sve_brkpb_z:
1451 case Intrinsic::aarch64_sve_cntp:
1452 case Intrinsic::aarch64_sve_compact:
1453 case Intrinsic::aarch64_sve_eor_z:
1454 case Intrinsic::aarch64_sve_eorv:
1455 case Intrinsic::aarch64_sve_eorqv:
1456 case Intrinsic::aarch64_sve_nand_z:
1457 case Intrinsic::aarch64_sve_nor_z:
1458 case Intrinsic::aarch64_sve_orn_z:
1459 case Intrinsic::aarch64_sve_orr_z:
1460 case Intrinsic::aarch64_sve_orv:
1461 case Intrinsic::aarch64_sve_orqv:
1462 case Intrinsic::aarch64_sve_pnext:
1463 case Intrinsic::aarch64_sve_rdffr_z:
1464 case Intrinsic::aarch64_sve_saddv:
1465 case Intrinsic::aarch64_sve_uaddv:
1466 case Intrinsic::aarch64_sve_umaxv:
1467 case Intrinsic::aarch64_sve_umaxqv:
1468 case Intrinsic::aarch64_sve_cmpeq:
1469 case Intrinsic::aarch64_sve_cmpeq_wide:
1470 case Intrinsic::aarch64_sve_cmpge:
1471 case Intrinsic::aarch64_sve_cmpge_wide:
1472 case Intrinsic::aarch64_sve_cmpgt:
1473 case Intrinsic::aarch64_sve_cmpgt_wide:
1474 case Intrinsic::aarch64_sve_cmphi:
1475 case Intrinsic::aarch64_sve_cmphi_wide:
1476 case Intrinsic::aarch64_sve_cmphs:
1477 case Intrinsic::aarch64_sve_cmphs_wide:
1478 case Intrinsic::aarch64_sve_cmple_wide:
1479 case Intrinsic::aarch64_sve_cmplo_wide:
1480 case Intrinsic::aarch64_sve_cmpls_wide:
1481 case Intrinsic::aarch64_sve_cmplt_wide:
1482 case Intrinsic::aarch64_sve_cmpne:
1483 case Intrinsic::aarch64_sve_cmpne_wide:
1484 case Intrinsic::aarch64_sve_facge:
1485 case Intrinsic::aarch64_sve_facgt:
1486 case Intrinsic::aarch64_sve_fcmpeq:
1487 case Intrinsic::aarch64_sve_fcmpge:
1488 case Intrinsic::aarch64_sve_fcmpgt:
1489 case Intrinsic::aarch64_sve_fcmpne:
1490 case Intrinsic::aarch64_sve_fcmpuo:
1491 case Intrinsic::aarch64_sve_ld1:
1492 case Intrinsic::aarch64_sve_ld1_gather:
1493 case Intrinsic::aarch64_sve_ld1_gather_index:
1494 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1495 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1496 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1497 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1498 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1499 case Intrinsic::aarch64_sve_ld1q_gather_index:
1500 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1501 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1502 case Intrinsic::aarch64_sve_ld1ro:
1503 case Intrinsic::aarch64_sve_ld1rq:
1504 case Intrinsic::aarch64_sve_ld1udq:
1505 case Intrinsic::aarch64_sve_ld1uwq:
1506 case Intrinsic::aarch64_sve_ld2_sret:
1507 case Intrinsic::aarch64_sve_ld2q_sret:
1508 case Intrinsic::aarch64_sve_ld3_sret:
1509 case Intrinsic::aarch64_sve_ld3q_sret:
1510 case Intrinsic::aarch64_sve_ld4_sret:
1511 case Intrinsic::aarch64_sve_ld4q_sret:
1512 case Intrinsic::aarch64_sve_ldff1:
1513 case Intrinsic::aarch64_sve_ldff1_gather:
1514 case Intrinsic::aarch64_sve_ldff1_gather_index:
1515 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1516 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1517 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1518 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1519 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1520 case Intrinsic::aarch64_sve_ldnf1:
1521 case Intrinsic::aarch64_sve_ldnt1:
1522 case Intrinsic::aarch64_sve_ldnt1_gather:
1523 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1524 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1525 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1527
1528 case Intrinsic::aarch64_sve_prf:
1529 case Intrinsic::aarch64_sve_prfb_gather_index:
1530 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1531 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1532 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1533 case Intrinsic::aarch64_sve_prfd_gather_index:
1534 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1535 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1536 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1537 case Intrinsic::aarch64_sve_prfh_gather_index:
1538 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1539 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1540 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1541 case Intrinsic::aarch64_sve_prfw_gather_index:
1542 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1543 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1544 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1546
1547 case Intrinsic::aarch64_sve_st1_scatter:
1548 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1549 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1550 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1551 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1552 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1553 case Intrinsic::aarch64_sve_st1dq:
1554 case Intrinsic::aarch64_sve_st1q_scatter_index:
1555 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1556 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1557 case Intrinsic::aarch64_sve_st1wq:
1558 case Intrinsic::aarch64_sve_stnt1:
1559 case Intrinsic::aarch64_sve_stnt1_scatter:
1560 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1561 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1562 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1564 case Intrinsic::aarch64_sve_st2:
1565 case Intrinsic::aarch64_sve_st2q:
1567 case Intrinsic::aarch64_sve_st3:
1568 case Intrinsic::aarch64_sve_st3q:
1570 case Intrinsic::aarch64_sve_st4:
1571 case Intrinsic::aarch64_sve_st4q:
1573 }
1574
1575 return SVEIntrinsicInfo();
1576}
1577
1578static bool isAllActivePredicate(Value *Pred) {
1579 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1580 Value *UncastedPred;
1581 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1582 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1583 m_Value(UncastedPred)))))
1584 // If the predicate has the same or less lanes than the uncasted
1585 // predicate then we know the casting has no effect.
1586 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1587 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1588 Pred = UncastedPred;
1589 auto *C = dyn_cast<Constant>(Pred);
1590 return (C && C->isAllOnesValue());
1591}
1592
1593// Simplify `V` by only considering the operations that affect active lanes.
1594// This function should only return existing Values or newly created Constants.
1595static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1596 auto *Dup = dyn_cast<IntrinsicInst>(V);
1597 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1598 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1600 cast<VectorType>(V->getType())->getElementCount(),
1601 cast<Constant>(Dup->getOperand(2)));
1602
1603 return V;
1604}
1605
1606static std::optional<Instruction *>
1608 const SVEIntrinsicInfo &IInfo) {
1609 const unsigned Opc = IInfo.getMatchingIROpode();
1610 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1611
1612 Value *Pg = II.getOperand(0);
1613 Value *Op1 = II.getOperand(1);
1614 Value *Op2 = II.getOperand(2);
1615 const DataLayout &DL = II.getDataLayout();
1616
1617 // Canonicalise constants to the RHS.
1619 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1620 IC.replaceOperand(II, 1, Op2);
1621 IC.replaceOperand(II, 2, Op1);
1622 return &II;
1623 }
1624
1625 // Only active lanes matter when simplifying the operation.
1626 Op1 = stripInactiveLanes(Op1, Pg);
1627 Op2 = stripInactiveLanes(Op2, Pg);
1628
1629 Value *SimpleII;
1630 if (auto FII = dyn_cast<FPMathOperator>(&II))
1631 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1632 else
1633 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1634
1635 // An SVE intrinsic's result is always defined. However, this is not the case
1636 // for its equivalent IR instruction (e.g. when shifting by an amount more
1637 // than the data's bitwidth). Simplifications to an undefined result must be
1638 // ignored to preserve the intrinsic's expected behaviour.
1639 if (!SimpleII || isa<UndefValue>(SimpleII))
1640 return std::nullopt;
1641
1642 if (IInfo.inactiveLanesAreNotDefined())
1643 return IC.replaceInstUsesWith(II, SimpleII);
1644
1645 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1646
1647 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1648 if (SimpleII == Inactive)
1649 return IC.replaceInstUsesWith(II, SimpleII);
1650
1651 // Inactive lanes must be preserved.
1652 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1653 return IC.replaceInstUsesWith(II, SimpleII);
1654}
1655
1656// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1657// to operations with less strict inactive lane requirements.
1658static std::optional<Instruction *>
1660 const SVEIntrinsicInfo &IInfo) {
1661 if (!IInfo.hasGoverningPredicate())
1662 return std::nullopt;
1663
1664 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1665
1666 // If there are no active lanes.
1667 if (match(OpPredicate, m_ZeroInt())) {
1669 return IC.replaceInstUsesWith(
1670 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1671
1672 if (IInfo.inactiveLanesAreUnused()) {
1673 if (IInfo.resultIsZeroInitialized())
1675
1676 return IC.eraseInstFromFunction(II);
1677 }
1678 }
1679
1680 // If there are no inactive lanes.
1681 if (isAllActivePredicate(OpPredicate)) {
1682 if (IInfo.hasOperandWithNoActiveLanes()) {
1683 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1684 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1685 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1686 }
1687
1688 if (IInfo.hasMatchingUndefIntrinsic()) {
1689 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1690 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1691 II.setCalledFunction(NewDecl);
1692 return &II;
1693 }
1694 }
1695
1696 // Operation specific simplifications.
1697 if (IInfo.hasMatchingIROpode() &&
1699 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1700
1701 return std::nullopt;
1702}
1703
1704// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1705// => (binop (pred) (from_svbool _) (from_svbool _))
1706//
1707// The above transformation eliminates a `to_svbool` in the predicate
1708// operand of bitwise operation `binop` by narrowing the vector width of
1709// the operation. For example, it would convert a `<vscale x 16 x i1>
1710// and` into a `<vscale x 4 x i1> and`. This is profitable because
1711// to_svbool must zero the new lanes during widening, whereas
1712// from_svbool is free.
1713static std::optional<Instruction *>
1715 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1716 if (!BinOp)
1717 return std::nullopt;
1718
1719 auto IntrinsicID = BinOp->getIntrinsicID();
1720 switch (IntrinsicID) {
1721 case Intrinsic::aarch64_sve_and_z:
1722 case Intrinsic::aarch64_sve_bic_z:
1723 case Intrinsic::aarch64_sve_eor_z:
1724 case Intrinsic::aarch64_sve_nand_z:
1725 case Intrinsic::aarch64_sve_nor_z:
1726 case Intrinsic::aarch64_sve_orn_z:
1727 case Intrinsic::aarch64_sve_orr_z:
1728 break;
1729 default:
1730 return std::nullopt;
1731 }
1732
1733 auto BinOpPred = BinOp->getOperand(0);
1734 auto BinOpOp1 = BinOp->getOperand(1);
1735 auto BinOpOp2 = BinOp->getOperand(2);
1736
1737 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1738 if (!PredIntr ||
1739 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1740 return std::nullopt;
1741
1742 auto PredOp = PredIntr->getOperand(0);
1743 auto PredOpTy = cast<VectorType>(PredOp->getType());
1744 if (PredOpTy != II.getType())
1745 return std::nullopt;
1746
1747 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1748 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1749 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1750 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1751 if (BinOpOp1 == BinOpOp2)
1752 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1753 else
1754 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1755 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1756
1757 auto NarrowedBinOp =
1758 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1759 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1760}
1761
// Simplify sve.convert.from.svbool(x) by walking the chain of to/from-svbool
// conversions feeding it, looking for an earlier value that already has the
// result type. NOTE(review): the signature line was reconstructed from the
// surrounding dispatch code — confirm against upstream.
static std::optional<Instruction *>
instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
  // If the reinterpret instruction operand is a PHI Node
  if (isa<PHINode>(II.getArgOperand(0)))
    return processPhiNode(IC, II);

  // A binop sandwiched between svbool conversions can be narrowed directly.
  if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
    return BinOpCombine;

  // Ignore converts to/from svcount_t.
  if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
      isa<TargetExtType>(II.getType()))
    return std::nullopt;

  SmallVector<Instruction *, 32> CandidatesForRemoval;
  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;

  const auto *IVTy = cast<VectorType>(II.getType());

  // Walk the chain of conversions.
  while (Cursor) {
    // If the type of the cursor has fewer lanes than the final result, zeroing
    // must take place, which breaks the equivalence chain.
    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
    if (CursorVTy->getElementCount().getKnownMinValue() <
        IVTy->getElementCount().getKnownMinValue())
      break;

    // If the cursor has the same type as I, it is a viable replacement.
    if (Cursor->getType() == IVTy)
      EarliestReplacement = Cursor;

    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);

    // If this is not an SVE conversion intrinsic, this is the end of the chain.
    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_to_svbool ||
                              IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_from_svbool))
      break;

    // Keep the chain ordered earliest-first as we walk backwards.
    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
    Cursor = IntrinsicCursor->getOperand(0);
  }

  // If no viable replacement in the conversion chain was found, there is
  // nothing to do.
  if (!EarliestReplacement)
    return std::nullopt;

  return IC.replaceInstUsesWith(II, EarliestReplacement);
}
1814
1815static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1816 IntrinsicInst &II) {
1817 // svsel(ptrue, x, y) => x
1818 auto *OpPredicate = II.getOperand(0);
1819 if (isAllActivePredicate(OpPredicate))
1820 return IC.replaceInstUsesWith(II, II.getOperand(1));
1821
1822 auto Select =
1823 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1824 return IC.replaceInstUsesWith(II, Select);
1825}
1826
1827static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1828 IntrinsicInst &II) {
1829 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1830 if (!Pg)
1831 return std::nullopt;
1832
1833 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1834 return std::nullopt;
1835
1836 const auto PTruePattern =
1837 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1838 if (PTruePattern != AArch64SVEPredPattern::vl1)
1839 return std::nullopt;
1840
1841 // The intrinsic is inserting into lane zero so use an insert instead.
1842 auto *IdxTy = Type::getInt64Ty(II.getContext());
1843 auto *Insert = InsertElementInst::Create(
1844 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1845 Insert->insertBefore(II.getIterator());
1846 Insert->takeName(&II);
1847
1848 return IC.replaceInstUsesWith(II, Insert);
1849}
1850
1851static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1852 IntrinsicInst &II) {
1853 // Replace DupX with a regular IR splat.
1854 auto *RetTy = cast<ScalableVectorType>(II.getType());
1855 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1856 II.getArgOperand(0));
1857 Splat->takeName(&II);
1858 return IC.replaceInstUsesWith(II, Splat);
1859}
1860
// Fold cmpne(pg_all, dupq(constant vector), splat(0)) into a ptrue of the
// narrowest predicate element type that reproduces the constant's active
// lanes, bridged through convert to/from svbool.
static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();

  // Only an all-active governing predicate can be modelled by a plain ptrue.
  if (!isAllActivePredicate(II.getArgOperand(0)))
    return std::nullopt;

  // Check that we have a compare of zero..
  auto *SplatValue =
      dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
  if (!SplatValue || !SplatValue->isZero())
    return std::nullopt;

  // ..against a dupq
  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!DupQLane ||
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return std::nullopt;

  // Where the dupq is a lane 0 replicate of a vector insert
  auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
  if (!DupQLaneIdx || !DupQLaneIdx->isZero())
    return std::nullopt;

  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
    return std::nullopt;

  // Where the vector insert is a fixed constant vector insert into undef at
  // index zero
  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return std::nullopt;

  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return std::nullopt;

  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
  if (!ConstVec)
    return std::nullopt;

  // The fixed constant must fill exactly one 128-bit block of the result.
  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return std::nullopt;

  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  // Expand intrinsic operands to a 16-bit byte level predicate
  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
    if (!Arg)
      return std::nullopt;
    if (!Arg->isZero())
      PredicateBits |= 1 << (I * (16 / NumElts));
  }

  // If all bits are zero bail early with an empty predicate
  if (PredicateBits == 0) {
    auto *PFalse = Constant::getNullValue(II.getType());
    PFalse->takeName(&II);
    return IC.replaceInstUsesWith(II, PFalse);
  }

  // Calculate largest predicate type used (where byte predicate is largest)
  // Accumulate the set bit positions (mod 8); the lowest set bit of Mask is
  // then the element stride in bytes that all active bits share.
  unsigned Mask = 8;
  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)
      Mask |= (I % 8);

  unsigned PredSize = Mask & -Mask;
  auto *PredType = ScalableVectorType::get(
      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));

  // Ensure all relevant bits are set
  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return std::nullopt;

  // Build ptrue of the narrow predicate type and convert it to the result
  // type via svbool.
  auto *PTruePat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                           {PredType}, {PTruePat});
  auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
      IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                                 {II.getType()}, {ConvertToSVBool});

  ConvertFromSVBool->takeName(&II);
  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
}
1953
// Simplify lasta/lastb: fold splats, distribute the extract over a
// single-use binop with a splat operand, and lower known-constant predicate
// patterns to a plain extractelement.
static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(1);
  auto IntrinsicID = II.getIntrinsicID();
  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;

  // lastX(splat(X)) --> X
  if (auto *SplatVal = getSplatValue(Vec))
    return IC.replaceInstUsesWith(II, SplatVal);

  // If x and/or y is a splat value then:
  // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
  Value *LHS, *RHS;
  if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
    if (isSplatValue(LHS) || isSplatValue(RHS)) {
      auto *OldBinOp = cast<BinaryOperator>(Vec);
      auto OpC = OldBinOp->getOpcode();
      auto *NewLHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
      auto *NewRHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
      // NOTE(review): this line was dropped by the scrape and reconstructed —
      // confirm against upstream.
      auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
      return IC.replaceInstUsesWith(II, NewBinOp);
    }
  }

  auto *C = dyn_cast<Constant>(Pg);
  if (IsAfter && C && C->isNullValue()) {
    // The intrinsic is extracting lane 0 so use an extract instead.
    auto *IdxTy = Type::getInt64Ty(II.getContext());
    auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
    Extract->insertBefore(II.getIterator());
    Extract->takeName(&II);
    return IC.replaceInstUsesWith(II, Extract);
  }

  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
  if (!IntrPG)
    return std::nullopt;

  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();

  // Can the intrinsic's predicate be converted to a known constant index?
  unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
  if (!MinNumElts)
    return std::nullopt;

  unsigned Idx = MinNumElts - 1;
  // Increment the index if extracting the element after the last active
  // predicate element.
  if (IsAfter)
    ++Idx;

  // Ignore extracts whose index is larger than the known minimum vector
  // length. NOTE: This is an artificial constraint where we prefer to
  // maintain what the user asked for until an alternative is proven faster.
  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return std::nullopt;

  // The intrinsic is extracting a fixed lane so use an extract instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
  Extract->insertBefore(II.getIterator());
  Extract->takeName(&II);
  return IC.replaceInstUsesWith(II, Extract);
}
2027
2028static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2029 IntrinsicInst &II) {
2030 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2031 // integer variant across a variety of micro-architectures. Replace scalar
2032 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2033 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2034 // depending on the micro-architecture, but has been observed as generally
2035 // being faster, particularly when the CLAST[AB] op is a loop-carried
2036 // dependency.
2037 Value *Pg = II.getArgOperand(0);
2038 Value *Fallback = II.getArgOperand(1);
2039 Value *Vec = II.getArgOperand(2);
2040 Type *Ty = II.getType();
2041
2042 if (!Ty->isIntegerTy())
2043 return std::nullopt;
2044
2045 Type *FPTy;
2046 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2047 default:
2048 return std::nullopt;
2049 case 16:
2050 FPTy = IC.Builder.getHalfTy();
2051 break;
2052 case 32:
2053 FPTy = IC.Builder.getFloatTy();
2054 break;
2055 case 64:
2056 FPTy = IC.Builder.getDoubleTy();
2057 break;
2058 }
2059
2060 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2061 auto *FPVTy = VectorType::get(
2062 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2063 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2064 auto *FPII = IC.Builder.CreateIntrinsic(
2065 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2066 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2067 return IC.replaceInstUsesWith(II, FPIItoInt);
2068}
2069
2070static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2071 IntrinsicInst &II) {
2072 LLVMContext &Ctx = II.getContext();
2073 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2074 // can work with RDFFR_PP for ptest elimination.
2075 auto *AllPat =
2076 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2077 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2078 {II.getType()}, {AllPat});
2079 auto *RDFFR =
2080 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2081 RDFFR->takeName(&II);
2082 return IC.replaceInstUsesWith(II, RDFFR);
2083}
2084
// Fold SVE element-count intrinsics with a constant predicate pattern.
// NumElts is the per-128-bit-block element count implied by the intrinsic.
// NOTE(review): the signature and the CreateElementCount line were dropped by
// the scrape and reconstructed — confirm against upstream.
static std::optional<Instruction *>
instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();

  // "all" counts every element: materialize the scalable element count.
  if (Pattern == AArch64SVEPredPattern::all) {
    Value *Cnt = IC.Builder.CreateElementCount(
        II.getType(), ElementCount::getScalable(NumElts));
    Cnt->takeName(&II);
    return IC.replaceInstUsesWith(II, Cnt);
  }

  // Fixed patterns are compile-time constants when the minimum vector
  // length is known to cover the requested count.
  unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);

  return MinNumElts && NumElts >= MinNumElts
             ? std::optional<Instruction *>(IC.replaceInstUsesWith(
                   II, ConstantInt::get(II.getType(), MinNumElts)))
             : std::nullopt;
}
2103
// Fold SME cnts* intrinsics when compiling for streaming mode.
// NOTE(review): the signature and the CreateElementCount line were dropped by
// the scrape and reconstructed (including the function name) — confirm
// against upstream.
static std::optional<Instruction *>
instCombineSMECntsElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts,
                       const AArch64Subtarget *ST) {
  // Outside streaming mode the count is not known here.
  if (!ST->isStreaming())
    return std::nullopt;

  // In streaming-mode, aarch64_sme_cnts is equivalent to aarch64_sve_cnt
  // with SVEPredPattern::all
  Value *Cnt = IC.Builder.CreateElementCount(
      II.getType(), ElementCount::getScalable(NumElts));
  Cnt->takeName(&II);
  return IC.replaceInstUsesWith(II, Cnt);
}
2117
// Simplify SVE ptest intrinsics: canonicalize to ptest_any where legal, and
// strip redundant svbool conversions or repeated governing predicates.
static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  Value *PgVal = II.getArgOperand(0);
  Value *OpVal = II.getArgOperand(1);

  // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
  // Later optimizations prefer this form.
  if (PgVal == OpVal &&
      (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
       II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
    Value *Ops[] = {PgVal, OpVal};
    Type *Tys[] = {PgVal->getType()};

    auto *PTest =
        IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
  IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);

  if (!Pg || !Op)
    return std::nullopt;

  Intrinsic::ID OpIID = Op->getIntrinsicID();

  // ptest(to_svbool(Pg), to_svbool(Op)) => ptest(Pg, Op) when both operands
  // convert from the same predicate type.
  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
      Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
    Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
    Type *Tys[] = {Pg->getArgOperand(0)->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);

    PTest->takeName(&II);
    return IC.replaceInstUsesWith(II, PTest);
  }

  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
  // Later optimizations may rewrite sequence to use the flag-setting variant
  // of instruction X to remove PTEST.
  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {
    Value *Ops[] = {Pg->getArgOperand(0), Pg};
    Type *Tys[] = {Pg->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  return std::nullopt;
}
2185
// Fuse a single-use multiply (MulOpc) feeding an add/sub intrinsic into the
// corresponding fused multiply-accumulate intrinsic (FuseOpc), provided both
// share the same governing predicate. MergeIntoAddendOp selects whether the
// addend is operand 1 (mla/mls-style) or operand 2 (mad/msb-style).
// NOTE(review): the signature line was dropped by the scrape and
// reconstructed — confirm against upstream.
template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                  bool MergeIntoAddendOp) {
  Value *P = II.getOperand(0);
  Value *MulOp0, *MulOp1, *AddendOp, *Mul;
  if (MergeIntoAddendOp) {
    AddendOp = II.getOperand(1);
    Mul = II.getOperand(2);
  } else {
    AddendOp = II.getOperand(2);
    Mul = II.getOperand(1);
  }

  // The multiply must use the same predicate as the add/sub.
  if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
                                      m_Value(MulOp1))))
    return std::nullopt;

  // Fusing would duplicate the multiply if it had other users.
  if (!Mul->hasOneUse())
    return std::nullopt;

  Instruction *FMFSource = nullptr;
  if (II.getType()->isFPOrFPVectorTy()) {
    llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
    // Stop the combine when the flags on the inputs differ in case dropping
    // flags would lead to us missing out on more beneficial optimizations.
    if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
      return std::nullopt;
    if (!FAddFlags.allowContract())
      return std::nullopt;
    FMFSource = &II;
  }

  CallInst *Res;
  if (MergeIntoAddendOp)
    Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                     {P, AddendOp, MulOp0, MulOp1}, FMFSource);
  else
    Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
                                     {P, MulOp0, MulOp1, AddendOp}, FMFSource);

  return IC.replaceInstUsesWith(II, Res);
}
2229
// Lower sve.ld1 to a plain load when the predicate is all active, otherwise
// to a generic masked.load with a zero passthru.
// NOTE(review): the signature line was dropped by the scrape and
// reconstructed — confirm against upstream.
static std::optional<Instruction *>
instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
  Value *Pred = II.getOperand(0);
  Value *PtrOp = II.getOperand(1);
  Type *VecTy = II.getType();

  if (isAllActivePredicate(Pred)) {
    LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
    Load->copyMetadata(II);
    return IC.replaceInstUsesWith(II, Load);
  }

  CallInst *MaskedLoad =
      IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
                                  Pred, ConstantAggregateZero::get(VecTy));
  MaskedLoad->copyMetadata(II);
  return IC.replaceInstUsesWith(II, MaskedLoad);
}
2248
// Lower sve.st1 to a plain store when the predicate is all active, otherwise
// to a generic masked.store.
// NOTE(review): the signature line was dropped by the scrape and
// reconstructed — confirm against upstream.
static std::optional<Instruction *>
instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
  Value *VecOp = II.getOperand(0);
  Value *Pred = II.getOperand(1);
  Value *PtrOp = II.getOperand(2);

  if (isAllActivePredicate(Pred)) {
    StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
    Store->copyMetadata(II);
    return IC.eraseInstFromFunction(II);
  }

  CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
      VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
  MaskedStore->copyMetadata(II);
  return IC.eraseInstFromFunction(II);
}
2266
// Map an unpredicated (_u) SVE FP binop intrinsic to the equivalent IR
// opcode, or BinaryOpsEnd when there is no direct equivalent.
// NOTE(review): the signature line was dropped by the scrape and
// reconstructed — confirm against upstream.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
  switch (Intrinsic) {
  case Intrinsic::aarch64_sve_fmul_u:
    return Instruction::BinaryOps::FMul;
  case Intrinsic::aarch64_sve_fadd_u:
    return Instruction::BinaryOps::FAdd;
  case Intrinsic::aarch64_sve_fsub_u:
    return Instruction::BinaryOps::FSub;
  default:
    // Sentinel: caller treats this as "no IR equivalent".
    return Instruction::BinaryOpsEnd;
  }
}
2279
// Replace an SVE FP binop intrinsic with the corresponding plain IR
// instruction (preserving fast-math flags) when its governing predicate is
// all active. NOTE(review): the signature line was dropped by the scrape and
// reconstructed — confirm against upstream.
static std::optional<Instruction *>
instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
  // Bail due to missing support for ISD::STRICT_ scalable vector operations.
  if (II.isStrictFP())
    return std::nullopt;

  auto *OpPredicate = II.getOperand(0);
  auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
  if (BinOpCode == Instruction::BinaryOpsEnd ||
      !isAllActivePredicate(OpPredicate))
    return std::nullopt;
  auto BinOp = IC.Builder.CreateBinOpFMF(
      BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
  return IC.replaceInstUsesWith(II, BinOp);
}
2295
2296static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2297 IntrinsicInst &II) {
2298 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2299 Intrinsic::aarch64_sve_mla>(
2300 IC, II, true))
2301 return MLA;
2302 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2303 Intrinsic::aarch64_sve_mad>(
2304 IC, II, false))
2305 return MAD;
2306 return std::nullopt;
2307}
2308
// fadd(p, a, fmul(p, b, c)) --> fmla/fmad; also accept an unpredicated
// multiply. NOTE(review): the signature line was dropped by the scrape and
// reconstructed — confirm against upstream.
static std::optional<Instruction *>
instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
  if (auto FMLA =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmla>(IC, II,
                                                                         true))
    return FMLA;
  if (auto FMAD =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmad>(IC, II,
                                                                         false))
    return FMAD;
  if (auto FMLA =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmla>(IC, II,
                                                                         true))
    return FMLA;
  return std::nullopt;
}
2328
// Unpredicated fadd_u variant of the fold above; falls back to lowering to a
// plain IR fadd when the predicate is all active.
// NOTE(review): the signature line was dropped by the scrape and
// reconstructed — confirm against upstream.
static std::optional<Instruction *>
instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
  if (auto FMLA =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmla>(IC, II,
                                                                         true))
    return FMLA;
  if (auto FMAD =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmad>(IC, II,
                                                                         false))
    return FMAD;
  if (auto FMLA_U =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmla_u>(
              IC, II, true))
    return FMLA_U;
  return instCombineSVEVectorBinOp(IC, II);
}
2348
// fsub(p, a, fmul(p, b, c)) --> fmls/fnmsb; also accept an unpredicated
// multiply. NOTE(review): the signature line was dropped by the scrape and
// reconstructed — confirm against upstream.
static std::optional<Instruction *>
instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
  if (auto FMLS =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmls>(IC, II,
                                                                         true))
    return FMLS;
  if (auto FMSB =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fnmsb>(
              IC, II, false))
    return FMSB;
  if (auto FMLS =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmls>(IC, II,
                                                                         true))
    return FMLS;
  return std::nullopt;
}
2368
// Unpredicated fsub_u variant; falls back to lowering to a plain IR fsub
// when the predicate is all active.
// NOTE(review): the signature line was dropped by the scrape and
// reconstructed — confirm against upstream.
static std::optional<Instruction *>
instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
  if (auto FMLS =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fmls>(IC, II,
                                                                         true))
    return FMLS;
  if (auto FMSB =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
                                            Intrinsic::aarch64_sve_fnmsb>(
              IC, II, false))
    return FMSB;
  if (auto FMLS_U =
          instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
                                            Intrinsic::aarch64_sve_fmls_u>(
              IC, II, true))
    return FMLS_U;
  return instCombineSVEVectorBinOp(IC, II);
}
2388
2389static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2390 IntrinsicInst &II) {
2391 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2392 Intrinsic::aarch64_sve_mls>(
2393 IC, II, true))
2394 return MLS;
2395 return std::nullopt;
2396}
2397
2398static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2399 IntrinsicInst &II) {
2400 Value *UnpackArg = II.getArgOperand(0);
2401 auto *RetTy = cast<ScalableVectorType>(II.getType());
2402 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2403 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2404
2405 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2406 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2407 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2408 ScalarArg =
2409 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2410 Value *NewVal =
2411 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2412 NewVal->takeName(&II);
2413 return IC.replaceInstUsesWith(II, NewVal);
2414 }
2415
2416 return std::nullopt;
2417}
2418static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2419 IntrinsicInst &II) {
2420 auto *OpVal = II.getOperand(0);
2421 auto *OpIndices = II.getOperand(1);
2422 VectorType *VTy = cast<VectorType>(II.getType());
2423
2424 // Check whether OpIndices is a constant splat value < minimal element count
2425 // of result.
2426 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2427 if (!SplatValue ||
2428 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2429 return std::nullopt;
2430
2431 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2432 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2433 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2434 auto *VectorSplat =
2435 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2436
2437 VectorSplat->takeName(&II);
2438 return IC.replaceInstUsesWith(II, VectorSplat);
2439}
2440
// Recognize uzp1 of svbool conversions as a simple concatenation of the two
// source predicates. NOTE(review): two interior lines (the type check on
// RetTy and the first CreateInsertVector argument list) were dropped by the
// scrape and reconstructed — confirm against upstream.
static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *A, *B;
  Type *RetTy = II.getType();
  constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
  constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;

  // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
  // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
  if ((match(II.getArgOperand(0),
             m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
       match(II.getArgOperand(1),
             m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
      (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
       match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
    auto *TyA = cast<ScalableVectorType>(A->getType());
    if (TyA == B->getType() &&
        RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
      // Build <A, B> with two subvector inserts: A at index 0, B after it.
      auto *SubVec = IC.Builder.CreateInsertVector(
          RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
      auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
                                                      TyA->getMinNumElements());
      ConcatVec->takeName(&II);
      return IC.replaceInstUsesWith(II, ConcatVec);
    }
  }

  return std::nullopt;
}
2470
2471static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2472 IntrinsicInst &II) {
2473 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2474 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2475 Value *A, *B;
2476 if (match(II.getArgOperand(0),
2477 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
2478 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2479 m_Specific(A), m_Specific(B))))
2480 return IC.replaceInstUsesWith(
2481 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2482
2483 return std::nullopt;
2484}
2485
// Fold a unit-strided gather into a regular masked load.
// NOTE(review): the signature line was dropped by the scrape and
// reconstructed — confirm against upstream.
static std::optional<Instruction *>
instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
  Value *Mask = II.getOperand(0);
  Value *BasePtr = II.getOperand(1);
  Value *Index = II.getOperand(2);
  Type *Ty = II.getType();
  Value *PassThru = ConstantAggregateZero::get(Ty);

  // Contiguous gather => masked load.
  // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
  // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
  Value *IndexBase;
  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
                       m_Value(IndexBase), m_SpecificInt(1)))) {
    Align Alignment =
        BasePtr->getPointerAlignment(II.getDataLayout());

    Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
                                      BasePtr, IndexBase);
    CallInst *MaskedLoad =
        IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
    MaskedLoad->takeName(&II);
    return IC.replaceInstUsesWith(II, MaskedLoad);
  }

  return std::nullopt;
}
2513
// Fold a unit-strided scatter into a regular masked store.
// NOTE(review): the signature line was dropped by the scrape and
// reconstructed — confirm against upstream.
static std::optional<Instruction *>
instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
  Value *Val = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *BasePtr = II.getOperand(2);
  Value *Index = II.getOperand(3);
  Type *Ty = Val->getType();

  // Contiguous scatter => masked store.
  // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
  // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
  Value *IndexBase;
  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
                       m_Value(IndexBase), m_SpecificInt(1)))) {
    Align Alignment =
        BasePtr->getPointerAlignment(II.getDataLayout());

    Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
                                      BasePtr, IndexBase);
    // The store's value is its only result; discard the returned CallInst.
    (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);

    return IC.eraseInstFromFunction(II);
  }

  return std::nullopt;
}
2540
2541static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2542 IntrinsicInst &II) {
2543 Type *Int32Ty = IC.Builder.getInt32Ty();
2544 Value *Pred = II.getOperand(0);
2545 Value *Vec = II.getOperand(1);
2546 Value *DivVec = II.getOperand(2);
2547
2548 Value *SplatValue = getSplatValue(DivVec);
2549 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2550 if (!SplatConstantInt)
2551 return std::nullopt;
2552
2553 APInt Divisor = SplatConstantInt->getValue();
2554 const int64_t DivisorValue = Divisor.getSExtValue();
2555 if (DivisorValue == -1)
2556 return std::nullopt;
2557 if (DivisorValue == 1)
2558 IC.replaceInstUsesWith(II, Vec);
2559
2560 if (Divisor.isPowerOf2()) {
2561 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2562 auto ASRD = IC.Builder.CreateIntrinsic(
2563 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2564 return IC.replaceInstUsesWith(II, ASRD);
2565 }
2566 if (Divisor.isNegatedPowerOf2()) {
2567 Divisor.negate();
2568 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2569 auto ASRD = IC.Builder.CreateIntrinsic(
2570 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2571 auto NEG = IC.Builder.CreateIntrinsic(
2572 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2573 return IC.replaceInstUsesWith(II, NEG);
2574 }
2575
2576 return std::nullopt;
2577}
2578
2579bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2580 size_t VecSize = Vec.size();
2581 if (VecSize == 1)
2582 return true;
2583 if (!isPowerOf2_64(VecSize))
2584 return false;
2585 size_t HalfVecSize = VecSize / 2;
2586
2587 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2588 RHS != Vec.end(); LHS++, RHS++) {
2589 if (*LHS != nullptr && *RHS != nullptr) {
2590 if (*LHS == *RHS)
2591 continue;
2592 else
2593 return false;
2594 }
2595 if (!AllowPoison)
2596 return false;
2597 if (*LHS == nullptr && *RHS != nullptr)
2598 *LHS = *RHS;
2599 }
2600
2601 Vec.resize(HalfVecSize);
2602 SimplifyValuePattern(Vec, AllowPoison);
2603 return true;
2604}
2605
// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
// to dupqlane(f64(C)) where C is A concatenated with B
static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
                                                           IntrinsicInst &II) {
  // Expect dupqlane(vector_insert(Default, fixed-vector chain, _)).
  Value *CurrentInsertElt = nullptr, *Default = nullptr;
  if (!match(II.getOperand(0),
             m_Intrinsic<Intrinsic::vector_insert>(
                 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
      !isa<FixedVectorType>(CurrentInsertElt->getType()))
    return std::nullopt;
  auto IIScalableTy = cast<ScalableVectorType>(II.getType());

  // Insert the scalars into a container ordered by InsertElement index
  // (nullptr marks lanes never written, i.e. poison).
  SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
  while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
    auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
    Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
    CurrentInsertElt = InsertElt->getOperand(0);
  }

  // Poison lanes may only be merged when both the chain base and the insert
  // default are poison.
  bool AllowPoison =
      isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
  if (!SimplifyValuePattern(Elts, AllowPoison))
    return std::nullopt;

  // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
  Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
  for (size_t I = 0; I < Elts.size(); I++) {
    if (Elts[I] == nullptr)
      continue;
    InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
                                                    IC.Builder.getInt64(I));
  }
  if (InsertEltChain == nullptr)
    return std::nullopt;

  // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
  // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
  // be bitcast to a type wide enough to fit the sequence, be splatted, and then
  // be narrowed back to the original type.
  unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
  unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
                                 IIScalableTy->getMinNumElements() /
                                 PatternWidth;

  IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
  auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
  auto *WideShuffleMaskTy =
      ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);

  auto InsertSubvector = IC.Builder.CreateInsertVector(
      II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
      uint64_t(0));
  auto WideBitcast =
      IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
  // An all-zero shuffle mask broadcasts wide lane 0 to every wide lane.
  auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
  auto WideShuffle = IC.Builder.CreateShuffleVector(
      WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
  auto NarrowBitcast =
      IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());

  return IC.replaceInstUsesWith(II, NarrowBitcast);
}
2669
2670static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2671 IntrinsicInst &II) {
2672 Value *A = II.getArgOperand(0);
2673 Value *B = II.getArgOperand(1);
2674 if (A == B)
2675 return IC.replaceInstUsesWith(II, A);
2676
2677 return std::nullopt;
2678}
2679
// Simplify `srshl(pred, vec, shift)` to `lsl(pred, vec, shift)` when the
// rounding behaviour of SRSHL provably cannot trigger: the shifted value is
// the result of an abs/sqabs (so known non-negative on active lanes) and the
// shift amount is non-negative (a left shift performs no rounding).
static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  Value *Pred = II.getOperand(0);
  Value *Vec = II.getOperand(1);
  Value *Shift = II.getOperand(2);

  // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
  // Capture the ABS's merge value and governing predicate for the checks
  // below; the intrinsic operand order is (merge, pred, src).
  Value *AbsPred, *MergedValue;
  if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
                      m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
      !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
                      m_Value(MergedValue), m_Value(AbsPred), m_Value())))

    return std::nullopt;

  // Transform is valid if any of the following are true:
  // * The ABS merge value is an undef or non-negative
  // * The ABS predicate is all active
  // * The ABS predicate and the SRSHL predicates are the same
  if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
      AbsPred != Pred && !isAllActivePredicate(AbsPred))
    return std::nullopt;

  // Only valid when the shift amount is non-negative, otherwise the rounding
  // behaviour of SRSHL cannot be ignored.
  if (!match(Shift, m_NonNegative()))
    return std::nullopt;

  auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
                                        {II.getType()}, {Pred, Vec, Shift});

  return IC.replaceInstUsesWith(II, LSL);
}
2713
2714static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2715 IntrinsicInst &II) {
2716 Value *Vec = II.getOperand(0);
2717
2718 if (getSplatValue(Vec) == II.getOperand(1))
2719 return IC.replaceInstUsesWith(II, Vec);
2720
2721 return std::nullopt;
2722}
2723
// Erase a DMB barrier when, within a bounded lookahead window, it is followed
// by an identical DMB with only instructions in between that neither touch
// memory nor have side effects — the later barrier already provides the
// required ordering.
static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  // If this barrier is post-dominated by identical one we can remove it
  auto *NI = II.getNextNode();
  unsigned LookaheadThreshold = DMBLookaheadThreshold;
  // An instruction may be stepped over if removing the first barrier cannot
  // change how that instruction is ordered with respect to memory.
  auto CanSkipOver = [](Instruction *I) {
    return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
  };
  while (LookaheadThreshold-- && CanSkipOver(NI)) {
    auto *NIBB = NI->getParent();
    NI = NI->getNextNode();
    if (!NI) {
      // Ran off the end of the block: continue the scan in a unique
      // successor if there is one, otherwise give up.
      if (auto *SuccBB = NIBB->getUniqueSuccessor())
        NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
      else
        break;
    }
  }
  // Only erase when the scan stopped at an intrinsic identical to II
  // (same intrinsic, same operands).
  auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
  if (NextII && II.isIdenticalTo(NextII))
    return IC.eraseInstFromFunction(II);

  return std::nullopt;
}
2748
2749static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2750 IntrinsicInst &II) {
2751 if (match(II.getOperand(0), m_ConstantInt<AArch64SVEPredPattern::all>()))
2752 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2753 return std::nullopt;
2754}
2755
2756static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2758 unsigned NumBits) {
2759 Value *Passthru = II.getOperand(0);
2760 Value *Pg = II.getOperand(1);
2761 Value *Op = II.getOperand(2);
2762
2763 // Convert UXT[BHW] to AND.
2764 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2765 auto *Ty = cast<VectorType>(II.getType());
2766 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2767 auto *Mask = ConstantInt::get(Ty, MaskValue);
2768 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2769 {Pg, Op, Mask});
2770 return IC.replaceInstUsesWith(II, And);
2771 }
2772
2773 return std::nullopt;
2774}
2775
2776static std::optional<Instruction *>
2778 SMEAttrs FnSMEAttrs(*II.getFunction());
2779 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2780 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2781 return IC.replaceInstUsesWith(
2782 II, ConstantInt::getBool(II.getType(), IsStreaming));
2783 return std::nullopt;
2784}
2785
/// Target-specific InstCombine entry point for AArch64 intrinsics. First runs
/// the generic SVE simplification framework, then dispatches on the intrinsic
/// ID to a dedicated combine helper. Each helper returns the replacement
/// instruction, or std::nullopt when no fold applies.
std::optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                     IntrinsicInst &II) const {
  const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II);
  if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
    return I;

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::aarch64_dmb:
    return instCombineDMB(IC, II);
  case Intrinsic::aarch64_neon_fmaxnm:
  case Intrinsic::aarch64_neon_fminnm:
    return instCombineMaxMinNM(IC, II);
  case Intrinsic::aarch64_sve_convert_from_svbool:
    return instCombineConvertFromSVBool(IC, II);
  case Intrinsic::aarch64_sve_dup:
    return instCombineSVEDup(IC, II);
  case Intrinsic::aarch64_sve_dup_x:
    return instCombineSVEDupX(IC, II);
  case Intrinsic::aarch64_sve_cmpne:
  case Intrinsic::aarch64_sve_cmpne_wide:
    return instCombineSVECmpNE(IC, II);
  case Intrinsic::aarch64_sve_rdffr:
    return instCombineRDFFR(IC, II);
  case Intrinsic::aarch64_sve_lasta:
  case Intrinsic::aarch64_sve_lastb:
    return instCombineSVELast(IC, II);
  case Intrinsic::aarch64_sve_clasta_n:
  case Intrinsic::aarch64_sve_clastb_n:
    return instCombineSVECondLast(IC, II);
  // cnt[bhwd] count elements per granule; the constant passed is the element
  // width in terms of 128-bit-chunk divisors (d=2, w=4, h=8, b=16).
  case Intrinsic::aarch64_sve_cntd:
    return instCombineSVECntElts(IC, II, 2);
  case Intrinsic::aarch64_sve_cntw:
    return instCombineSVECntElts(IC, II, 4);
  case Intrinsic::aarch64_sve_cnth:
    return instCombineSVECntElts(IC, II, 8);
  case Intrinsic::aarch64_sve_cntb:
    return instCombineSVECntElts(IC, II, 16);
  case Intrinsic::aarch64_sme_cntsd:
    return instCombineSMECntsElts(IC, II, 2, ST);
  case Intrinsic::aarch64_sme_cntsw:
    return instCombineSMECntsElts(IC, II, 4, ST);
  case Intrinsic::aarch64_sme_cntsh:
    return instCombineSMECntsElts(IC, II, 8, ST);
  case Intrinsic::aarch64_sme_cntsb:
    return instCombineSMECntsElts(IC, II, 16, ST);
  case Intrinsic::aarch64_sve_ptest_any:
  case Intrinsic::aarch64_sve_ptest_first:
  case Intrinsic::aarch64_sve_ptest_last:
    return instCombineSVEPTest(IC, II);
  case Intrinsic::aarch64_sve_fadd:
    return instCombineSVEVectorFAdd(IC, II);
  case Intrinsic::aarch64_sve_fadd_u:
    return instCombineSVEVectorFAddU(IC, II);
  case Intrinsic::aarch64_sve_fmul_u:
    return instCombineSVEVectorBinOp(IC, II);
  case Intrinsic::aarch64_sve_fsub:
    return instCombineSVEVectorFSub(IC, II);
  case Intrinsic::aarch64_sve_fsub_u:
    return instCombineSVEVectorFSubU(IC, II);
  case Intrinsic::aarch64_sve_add:
    return instCombineSVEVectorAdd(IC, II);
  // add_u/sub_u of a mul_u can be fused into mla_u/mls_u.
  case Intrinsic::aarch64_sve_add_u:
    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                             Intrinsic::aarch64_sve_mla_u>(
        IC, II, true);
  case Intrinsic::aarch64_sve_sub:
    return instCombineSVEVectorSub(IC, II);
  case Intrinsic::aarch64_sve_sub_u:
    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                             Intrinsic::aarch64_sve_mls_u>(
        IC, II, true);
  case Intrinsic::aarch64_sve_tbl:
    return instCombineSVETBL(IC, II);
  case Intrinsic::aarch64_sve_uunpkhi:
  case Intrinsic::aarch64_sve_uunpklo:
  case Intrinsic::aarch64_sve_sunpkhi:
  case Intrinsic::aarch64_sve_sunpklo:
    return instCombineSVEUnpack(IC, II);
  case Intrinsic::aarch64_sve_uzp1:
    return instCombineSVEUzp1(IC, II);
  case Intrinsic::aarch64_sve_zip1:
  case Intrinsic::aarch64_sve_zip2:
    return instCombineSVEZip(IC, II);
  case Intrinsic::aarch64_sve_ld1_gather_index:
    return instCombineLD1GatherIndex(IC, II);
  case Intrinsic::aarch64_sve_st1_scatter_index:
    return instCombineST1ScatterIndex(IC, II);
  case Intrinsic::aarch64_sve_ld1:
    return instCombineSVELD1(IC, II, DL);
  case Intrinsic::aarch64_sve_st1:
    return instCombineSVEST1(IC, II, DL);
  case Intrinsic::aarch64_sve_sdiv:
    return instCombineSVESDIV(IC, II);
  case Intrinsic::aarch64_sve_sel:
    return instCombineSVESel(IC, II);
  case Intrinsic::aarch64_sve_srshl:
    return instCombineSVESrshl(IC, II);
  case Intrinsic::aarch64_sve_dupq_lane:
    return instCombineSVEDupqLane(IC, II);
  case Intrinsic::aarch64_sve_insr:
    return instCombineSVEInsr(IC, II);
  case Intrinsic::aarch64_sve_ptrue:
    return instCombinePTrue(IC, II);
  // uxt[bhw] zero-extend the low 8/16/32 bits of each element.
  case Intrinsic::aarch64_sve_uxtb:
    return instCombineSVEUxt(IC, II, 8);
  case Intrinsic::aarch64_sve_uxth:
    return instCombineSVEUxt(IC, II, 16);
  case Intrinsic::aarch64_sve_uxtw:
    return instCombineSVEUxt(IC, II, 32);
  case Intrinsic::aarch64_sme_in_streaming_mode:
    return instCombineInStreamingMode(IC, II);
  }

  return std::nullopt;
}
2905
std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  // These NEON narrowing/rounding intrinsics compute each destination lane
  // from the corresponding source lane only, so the demanded-elements mask
  // can be propagated unchanged into operand 0.
  case Intrinsic::aarch64_neon_fcvtxn:
  case Intrinsic::aarch64_neon_rshrn:
  case Intrinsic::aarch64_neon_sqrshrn:
  case Intrinsic::aarch64_neon_sqrshrun:
  case Intrinsic::aarch64_neon_sqshrn:
  case Intrinsic::aarch64_neon_sqshrun:
  case Intrinsic::aarch64_neon_sqxtn:
  case Intrinsic::aarch64_neon_sqxtun:
  case Intrinsic::aarch64_neon_uqrshrn:
  case Intrinsic::aarch64_neon_uqshrn:
  case Intrinsic::aarch64_neon_uqxtn:
    SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
    break;
  }

  // No replacement value: the intrinsic itself is never simplified away here.
  return std::nullopt;
}
2931
// Scalable (VLA) auto-vectorization is enabled with full SVE, or with
// streaming(-compatible) SVE when explicitly requested via command-line flag.
bool AArch64TTIImpl::enableScalableVectorization() const {
  return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
                                  EnableScalableAutovecInStreamingMode);
}
2936
/// Report the register width for each register class:
/// - scalar GPRs are 64 bits;
/// - fixed-width vectors use the minimum SVE vector length (>= 128) when
///   fixed-length SVE codegen is enabled, otherwise 128-bit NEON, otherwise 0;
/// - scalable vectors report the 128-bit granule when SVE (or streaming SVE
///   with scalable autovec enabled) is available, otherwise 0.
TypeSize
AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->useSVEForFixedLengthVectors() &&
        (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
      return TypeSize::getFixed(
          std::max(ST->getMinSVEVectorSizeInBits(), 128u));
    else if (ST->isNeonAvailable())
      return TypeSize::getFixed(128);
    else
      return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
                                 EnableScalableAutovecInStreamingMode))
      return TypeSize::getScalable(128);
    else
      return TypeSize::getScalable(0);
  }
  llvm_unreachable("Unsupported register kind");
}
2960
/// Return true when an add/sub/mul with operands Args and result type DstTy
/// maps onto a NEON widening instruction (uaddl/saddl, uaddw/saddw,
/// usubl/ssubl, usubw/ssubw, smull/umull), which makes the operand
/// extension(s) free. SrcOverrideTy, if non-null, overrides the source type
/// deduced from the extend operands.
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args,
                                           Type *SrcOverrideTy) const {
  // A helper that returns a vector type from the given type. The number of
  // elements in type Ty determines the vector width.
  auto toVectorTy = [&](Type *ArgTy) {
    return VectorType::get(ArgTy->getScalarType(),
                           cast<VectorType>(DstTy)->getElementCount());
  };

  // Exit early if DstTy is not a vector type whose elements are one of [i16,
  // i32, i64]. SVE doesn't generally have the same set of instructions to
  // perform an extend with the add/sub/mul. There are SMULLB style
  // instructions, but they operate on top/bottom, requiring some sort of lane
  // interleaving to be used with zext/sext.
  unsigned DstEltSize = DstTy->getScalarSizeInBits();
  if (!useNeonVector(DstTy) || Args.size() != 2 ||
      (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
    return false;

  // Determine if the operation has a widening variant. We consider both the
  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
  // instructions.
  //
  // TODO: Add additional widening operations (e.g., shl, etc.) once we
  // verify that their extending operands are eliminated during code
  // generation.
  Type *SrcTy = SrcOverrideTy;
  switch (Opcode) {
  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
    // The second operand needs to be an extend
    if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
      if (!SrcTy)
        SrcTy =
            toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
    } else
      return false;
    break;
  case Instruction::Mul: { // SMULL(2), UMULL(2)
    // Both operands need to be extends of the same type.
    if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
        (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
      if (!SrcTy)
        SrcTy =
            toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
    } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
      // If one of the operands is a Zext and the other has enough zero bits to
      // be treated as unsigned, we can still general a umull, meaning the zext
      // is free.
      KnownBits Known =
          computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
      if (Args[0]->getType()->getScalarSizeInBits() -
              Known.Zero.countLeadingOnes() >
          DstTy->getScalarSizeInBits() / 2)
        return false;
      if (!SrcTy)
        SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
                                           DstTy->getScalarSizeInBits() / 2));
    } else
      return false;
    break;
  }
  default:
    return false;
  }

  // Legalize the destination type and ensure it can be used in a widening
  // operation.
  auto DstTyL = getTypeLegalizationCost(DstTy);
  if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
    return false;

  // Legalize the source type and ensure it can be used in a widening
  // operation.
  assert(SrcTy && "Expected some SrcTy");
  auto SrcTyL = getTypeLegalizationCost(SrcTy);
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
  if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
    return false;

  // Get the total number of vector elements in the legalized types.
  InstructionCost NumDstEls =
      DstTyL.first * DstTyL.second.getVectorMinNumElements();
  InstructionCost NumSrcEls =
      SrcTyL.first * SrcTyL.second.getVectorMinNumElements();

  // Return true if the legalized types have the same number of vector elements
  // and the destination element type size is twice that of the source type.
  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
}
3052
// s/urhadd instructions implement the following pattern, making the
// extends free:
// %x = add ((zext i8 -> i16), 1)
// %y = (zext i8 -> i16)
// trunc i16 (lshr (add %x, %y), 1) -> i8
//
// Return true if ExtUser (an add fed by the extend being costed) is part of
// such a rounding-halving-add expression, so the extend can be considered
// free.
bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
                                        Type *Src) const {
  // The source should be a legal vector type.
  if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
      (Src->isScalableTy() && !ST->hasSVE2()))
    return false;

  if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
    return false;

  // Look for trunc/shl/add before trying to match the pattern.
  const Instruction *Add = ExtUser;
  auto *AddUser =
      dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
  if (AddUser && AddUser->getOpcode() == Instruction::Add)
    Add = AddUser;

  auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
  if (!Shr || Shr->getOpcode() != Instruction::LShr)
    return false;

  // The trunc must narrow back to the original source element width.
  auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
  if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
      Src->getScalarSizeInBits() !=
          cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
    return false;

  // Try to match the whole pattern. Ext could be either the first or second
  // m_ZExtOrSExt matched.
  Instruction *Ex1, *Ex2;
  if (!(match(Add, m_c_Add(m_Instruction(Ex1),
                           m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
    return false;

  // Ensure both extends are of the same type
  if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
      Ex1->getOpcode() == Ex2->getOpcode())
    return true;

  return false;
}
3100
3102 Type *Src,
3105 const Instruction *I) const {
3106 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3107 assert(ISD && "Invalid opcode");
3108 // If the cast is observable, and it is used by a widening instruction (e.g.,
3109 // uaddl, saddw, etc.), it may be free.
3110 if (I && I->hasOneUser()) {
3111 auto *SingleUser = cast<Instruction>(*I->user_begin());
3112 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3113 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
3114 // For adds only count the second operand as free if both operands are
3115 // extends but not the same operation. (i.e both operands are not free in
3116 // add(sext, zext)).
3117 if (SingleUser->getOpcode() == Instruction::Add) {
3118 if (I == SingleUser->getOperand(1) ||
3119 (isa<CastInst>(SingleUser->getOperand(1)) &&
3120 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3121 return 0;
3122 } else // Others are free so long as isWideningInstruction returned true.
3123 return 0;
3124 }
3125
3126 // The cast will be free for the s/urhadd instructions
3127 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3128 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3129 return 0;
3130 }
3131
3132 // TODO: Allow non-throughput costs that aren't binary.
3133 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3135 return Cost == 0 ? 0 : 1;
3136 return Cost;
3137 };
3138
3139 EVT SrcTy = TLI->getValueType(DL, Src);
3140 EVT DstTy = TLI->getValueType(DL, Dst);
3141
3142 if (!SrcTy.isSimple() || !DstTy.isSimple())
3143 return AdjustCost(
3144 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3145
3146 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3147 // we use fcvtx under SVE2. Give them invalid costs.
3148 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3149 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3150 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3152
3153 static const TypeConversionCostTblEntry BF16Tbl[] = {
3154 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3155 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3156 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3157 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3158 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3159 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3160 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3161 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3162 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3163 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3164 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3165 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3166 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3167 };
3168
3169 if (ST->hasBF16())
3170 if (const auto *Entry = ConvertCostTableLookup(
3171 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3172 return AdjustCost(Entry->Cost);
3173
3174 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3175 // The cost of unpacking twice is artificially increased for now in order
3176 // to avoid regressions against NEON, which will use tbl instructions directly
3177 // instead of multiple layers of [s|u]unpk[lo|hi].
3178 // We use the unpacks in cases where the destination type is illegal and
3179 // requires splitting of the input, even if the input type itself is legal.
3180 const unsigned int SVE_EXT_COST = 1;
3181 const unsigned int SVE_FCVT_COST = 1;
3182 const unsigned int SVE_UNPACK_ONCE = 4;
3183 const unsigned int SVE_UNPACK_TWICE = 16;
3184
3185 static const TypeConversionCostTblEntry ConversionTbl[] = {
3186 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3187 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3188 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3189 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3190 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3191 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3192 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3193 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3194 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3195 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3196 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3197 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3198 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3199 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3200 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3201 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3202 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3203 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3204 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3205 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3206
3207 // Truncations on nxvmiN
3208 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3209 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3210 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3211 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3212 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3213 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3214 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3215 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3216 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3217 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3218 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3219 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3220 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3221 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3222 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3223 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3224 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3225 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3226 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3227 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3228 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3229 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3230 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3231 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3232 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3233 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3234 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3235 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3236 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3237 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3238 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3239 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3240 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3241
3242 // The number of shll instructions for the extension.
3243 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3244 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3245 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3246 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3247 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3248 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3249 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3250 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3251 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3252 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3253 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3254 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3255 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3256 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3257 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3258 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3259
3260 // FP Ext and trunc
3261 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3262 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3263 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3264 // FP16
3265 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3266 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3267 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3268 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3269 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3270 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3271 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3272 // BF16 (uses shift)
3273 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3274 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3275 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3276 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3277 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3278 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3279 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3280 // FP Ext and trunc
3281 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3282 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3283 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3284 // FP16
3285 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3286 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3287 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3288 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3289 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3290 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3291 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3292 // BF16 (more complex, with +bf16 is handled above)
3293 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3294 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3295 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3296 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3297 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3298 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3299 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3300 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3301
3302 // LowerVectorINT_TO_FP:
3303 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3304 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3305 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3306 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3307 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3308 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3309
3310 // SVE: to nxv2f16
3311 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3312 SVE_EXT_COST + SVE_FCVT_COST},
3313 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3314 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3315 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3316 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3317 SVE_EXT_COST + SVE_FCVT_COST},
3318 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3319 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3320 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3321
3322 // SVE: to nxv4f16
3323 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3324 SVE_EXT_COST + SVE_FCVT_COST},
3325 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3326 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3327 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3328 SVE_EXT_COST + SVE_FCVT_COST},
3329 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3330 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3331
3332 // SVE: to nxv8f16
3333 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3334 SVE_EXT_COST + SVE_FCVT_COST},
3335 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3336 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3337 SVE_EXT_COST + SVE_FCVT_COST},
3338 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3339
3340 // SVE: to nxv16f16
3341 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3342 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3343 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3344 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3345
3346 // Complex: to v2f32
3347 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3348 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3349 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3350 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3351
3352 // SVE: to nxv2f32
3353 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3354 SVE_EXT_COST + SVE_FCVT_COST},
3355 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3356 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3357 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3358 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3359 SVE_EXT_COST + SVE_FCVT_COST},
3360 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3361 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3362 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3363
3364 // Complex: to v4f32
3365 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3366 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3367 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3368 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3369
3370 // SVE: to nxv4f32
3371 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3372 SVE_EXT_COST + SVE_FCVT_COST},
3373 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3374 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3375 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3376 SVE_EXT_COST + SVE_FCVT_COST},
3377 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3378 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3379
3380 // Complex: to v8f32
3381 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3382 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3383 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3384 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3385
3386 // SVE: to nxv8f32
3387 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3388 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3389 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3390 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3391 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3392 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3393 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3394 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3395
3396 // SVE: to nxv16f32
3397 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3398 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3399 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3400 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3401
3402 // Complex: to v16f32
3403 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3404 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3405
3406 // Complex: to v2f64
3407 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3408 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3409 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3410 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3411 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3412 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3413
3414 // SVE: to nxv2f64
3415 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3416 SVE_EXT_COST + SVE_FCVT_COST},
3417 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3418 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3419 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3420 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3421 SVE_EXT_COST + SVE_FCVT_COST},
3422 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3423 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3424 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3425
3426 // Complex: to v4f64
3427 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3428 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3429
3430 // SVE: to nxv4f64
3431 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3432 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3433 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3434 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3435 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3436 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3437 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3438 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3439 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3440 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3441 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3442 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3443
3444 // SVE: to nxv8f64
3445 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3446 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3447 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3448 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3449 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3450 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3451 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3452 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3453
3454 // LowerVectorFP_TO_INT
3455 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3456 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3457 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3458 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3459 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3460 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3461
3462 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3463 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3464 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3465 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3466 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3467 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3468 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3469
3470 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3471 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3472 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3473 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3474 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3475
3476 // Complex, from nxv2f32.
3477 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3478 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3479 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3480 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3481 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3482 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3483 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3484 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3485
3486 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3487 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3488 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3489 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3490 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3491 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3492 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3493
3494 // Complex, from nxv2f64.
3495 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3496 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3497 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3498 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3499 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3500 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3501 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3502 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3503 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3504 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3505
3506 // Complex, from nxv4f32.
3507 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3508 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3509 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3510 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3511 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3512 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3513 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3514 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3515 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3516 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3517
3518 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3519 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3520 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3521 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3522 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3523
3524 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3525 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3526 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3527 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3528 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3529 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3530 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3531
3532 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3533 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3534 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3535 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3536 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3537
3538 // Complex, from nxv8f16.
3539 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3540 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3541 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3542 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3543 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3544 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3545 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3546 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3547 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3548 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3549
3550 // Complex, from nxv4f16.
3551 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3552 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3553 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3554 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3555 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3556 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3557 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3558 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3559
3560 // Complex, from nxv2f16.
3561 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3562 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3563 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3564 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3565 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3566 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3567 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3568 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3569
3570 // Truncate from nxvmf32 to nxvmf16.
3571 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3572 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3573 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3574
3575 // Truncate from nxvmf32 to nxvmbf16.
3576 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3577 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3578 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3579
3580 // Truncate from nxvmf64 to nxvmf16.
3581 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3582 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3583 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3584
3585 // Truncate from nxvmf64 to nxvmbf16.
3586 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3587 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3588 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3589
3590 // Truncate from nxvmf64 to nxvmf32.
3591 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3592 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3593 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3594
3595 // Extend from nxvmf16 to nxvmf32.
3596 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3597 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3598 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3599
3600 // Extend from nxvmbf16 to nxvmf32.
3601 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3602 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3603 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3604
3605 // Extend from nxvmf16 to nxvmf64.
3606 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3607 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3608 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3609
3610 // Extend from nxvmbf16 to nxvmf64.
3611 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3612 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3613 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3614
3615 // Extend from nxvmf32 to nxvmf64.
3616 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3617 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3618 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3619
3620 // Bitcasts from float to integer
3621 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3622 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3623 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3624
3625 // Bitcasts from integer to float
3626 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3627 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3628 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3629
3630 // Add cost for extending to illegal -too wide- scalable vectors.
3631 // zero/sign extend are implemented by multiple unpack operations,
3632 // where each operation has a cost of 1.
3633 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3634 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3635 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3636 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3637 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3638 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3639
3640 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3641 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3642 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3643 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3644 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3645 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3646 };
3647
3648 // We have to estimate a cost of fixed length operation upon
3649 // SVE registers(operations) with the number of registers required
3650 // for a fixed type to be represented upon SVE registers.
3651 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3652 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3653 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3654 ST->useSVEForFixedLengthVectors(WiderTy)) {
3655 std::pair<InstructionCost, MVT> LT =
3656 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3657 unsigned NumElements =
3658 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3659 return AdjustCost(
3660 LT.first *
3662 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3663 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3664 CostKind, I));
3665 }
3666
3667 if (const auto *Entry = ConvertCostTableLookup(
3668 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3669 return AdjustCost(Entry->Cost);
3670
3671 static const TypeConversionCostTblEntry FP16Tbl[] = {
3672 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3673 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3674 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3675 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3676 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3677 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3678 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3679 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3680 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3681 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3682 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3683 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3684 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3685 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3686 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3687 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3688 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3689 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3690 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3691 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3692 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3693 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3694 };
3695
3696 if (ST->hasFullFP16())
3697 if (const auto *Entry = ConvertCostTableLookup(
3698 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3699 return AdjustCost(Entry->Cost);
3700
3701 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3702 // double-rounding issues.
3703 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3704 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3705 isa<FixedVectorType>(Dst) && isa<FixedVectorType>(Src))
3706 return AdjustCost(
3707 cast<FixedVectorType>(Dst)->getNumElements() *
3708 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3709 CCH, CostKind) +
3710 BaseT::getScalarizationOverhead(cast<FixedVectorType>(Src), false, true,
3711 CostKind) +
3712 BaseT::getScalarizationOverhead(cast<FixedVectorType>(Dst), true, false,
3713 CostKind));
3714
3715 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3718 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3720 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3722 // The standard behaviour in the backend for these cases is to split the
3723 // extend up into two parts:
3724 // 1. Perform an extending load or masked load up to the legal type.
3725 // 2. Extend the loaded data to the final type.
3726 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3727 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3729 Opcode, LegalTy, Src, CCH, CostKind, I);
3731 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3732 return Part1 + Part2;
3733 }
3734
3735 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3736 // but we also want to include the TTI::CastContextHint::Masked case too.
3737 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3739 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3741
3742 return AdjustCost(
3743 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3744}
3745
// Cost of extracting a vector element and then sign-/zero-extending it to a
// scalar integer type. The extract cost is always charged; the extend is free
// when the target's mov-from-lane instruction performs it implicitly.
// NOTE(review): the leading signature lines (return type, function name, and
// the Opcode/Dst parameters) are not visible in this extraction — presumably
// AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, ...);
// confirm against the full source.
3748 VectorType *VecTy, unsigned Index,
3750
3751 // Make sure we were given a valid extend opcode.
3752 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3753 "Invalid opcode");
3754
3755 // We are extending an element we extract from a vector, so the source type
3756 // of the extend is the element type of the vector.
3757 auto *Src = VecTy->getElementType();
3758
3759 // Sign- and zero-extends are for integer types only.
3760 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3761
3762 // Get the cost for the extract. We compute the cost (if any) for the extend
3763 // below.
3764 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3765 CostKind, Index, nullptr, nullptr);
3766
3767 // Legalize the types.
3768 auto VecLT = getTypeLegalizationCost(VecTy)
3769 auto DstVT = TLI->getValueType(DL, Dst);
3770 auto SrcVT = TLI->getValueType(DL, Src);
3771
3772 // If the resulting type is still a vector and the destination type is legal,
3773 // we may get the extension for free. If not, get the default cost for the
3774 // extend.
3775 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3776 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3777 CostKind);
3778
3779 // The destination type should be larger than the element type. If not, get
3780 // the default cost for the extend.
3781 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3782 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3783 CostKind);
3784
3785 switch (Opcode) {
3786 default:
3787 llvm_unreachable("Opcode should be either SExt or ZExt");
3788
3789 // For sign-extends, we only need a smov, which performs the extension
3790 // automatically.
3791 case Instruction::SExt:
3792 return Cost;
3793
3794 // For zero-extends, the extend is performed automatically by a umov unless
3795 // the destination type is i64 and the element type is i8 or i16.
// A umov to a 32-bit W register implicitly zeroes the upper half of the X
// register, so only the i64-destination / sub-i32-element case needs a real
// extend instruction (falls through to the default-cost return below).
3796 case Instruction::ZExt:
3797 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3798 return Cost;
3799 }
3800
3801 // If we are unable to perform the extend for free, get the default cost.
3802 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3803 CostKind);
3804}
3805
// Cost of control-flow instructions (PHI / branches).
// NOTE(review): original line 3809 is missing from this extraction —
// presumably a guard like `if (CostKind != TTI::TCK_RecipThroughput)` that the
// return on the next line belongs to; otherwise the assert below would be
// unreachable. Confirm against the full source.
3808 const Instruction *I) const {
// For non-throughput cost kinds: PHIs are free, other CF instructions cost 1.
3810 return Opcode == Instruction::PHI ? 0 : 1;
3811 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3812 // Branches are assumed to be predicted.
3813 return 0;
3814 }
3815
// Shared implementation for all getVectorInstrCost overloads: estimates the
// cost of a single insertelement/extractelement. Returns 0 for the cases the
// backend can fold away (scalarized legal type, lane-0 FP accesses, extracts
// that fuse into a scalar fmul), otherwise the subtarget's base insert/extract
// cost. `I`/`Scalar`/`ScalarUserAndIdx` provide optional IR context used to
// detect the foldable patterns.
3816InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3817 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3818 const Instruction *I, Value *Scalar,
3819 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3820 assert(Val->isVectorTy() && "This must be a vector type");
3821
// -1U is the sentinel for "index unknown"; index-specific folds only apply
// when a concrete lane index was supplied.
3822 if (Index != -1U) {
3823 // Legalize the type.
3824 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3825
3826 // This type is legalized to a scalar type.
3827 if (!LT.second.isVector())
3828 return 0;
3829
3830 // The type may be split. For fixed-width vectors we can normalize the
3831 // index to the new type.
3832 if (LT.second.isFixedLengthVector()) {
3833 unsigned Width = LT.second.getVectorNumElements();
3834 Index = Index % Width;
3835 }
3836
3837 // The element at index zero is already inside the vector.
3838 // - For a insert-element or extract-element
3839 // instruction that extracts integers, an explicit FPR -> GPR move is
3840 // needed. So it has non-zero cost.
3841 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
3842 return 0;
3843
3844 // This is recognising a LD1 single-element structure to one lane of one
3845 // register instruction. I.e., if this is an `insertelement` instruction,
3846 // and its second operand is a load, then we will generate a LD1, which
3847 // are expensive instructions.
// NOTE(review): original line 3851 (the non-code-size cost expression of
// this ternary) is missing from this extraction — confirm against source.
3848 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3849 return CostKind == TTI::TCK_CodeSize
3850 ? 0
3852
3853 // i1 inserts and extract will include an extra cset or cmp of the vector
3854 // value. Increase the cost by 1 to account.
// NOTE(review): original line 3858 (the non-code-size arm of this ternary)
// is missing from this extraction — confirm against source.
3855 if (Val->getScalarSizeInBits() == 1)
3856 return CostKind == TTI::TCK_CodeSize
3857 ? 2
3859
3860 // FIXME:
3861 // If the extract-element and insert-element instructions could be
3862 // simplified away (e.g., could be combined into users by looking at use-def
3863 // context), they have no cost. This is not done in the first place for
3864 // compile-time considerations.
3865 }
3866
3867 // In case of Neon, if there exists extractelement from lane != 0 such that
3868 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3869 // 2. extractelement result feeds into fmul.
3870 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3871 // equivalent to 0.
3872 // then the extractelement can be merged with fmul in the backend and it
3873 // incurs no cost.
3874 // e.g.
3875 // define double @foo(<2 x double> %a) {
3876 // %1 = extractelement <2 x double> %a, i32 0
3877 // %2 = extractelement <2 x double> %a, i32 1
3878 // %res = fmul double %1, %2
3879 // ret double %res
3880 // }
3881 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3882 auto ExtractCanFuseWithFmul = [&]() {
3883 // We bail out if the extract is from lane 0.
3884 if (Index == 0)
3885 return false;
3886
3887 // Check if the scalar element type of the vector operand of ExtractElement
3888 // instruction is one of the allowed types.
// Only f32/f64 (and f16 when FullFP16 is available) have a by-element fmul
// form the extract can be folded into.
3889 auto IsAllowedScalarTy = [&](const Type *T) {
3890 return T->isFloatTy() || T->isDoubleTy() ||
3891 (T->isHalfTy() && ST->hasFullFP16());
3892 };
3893
3894 // Check if the extractelement user is scalar fmul.
3895 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3896 // Check if the user is scalar fmul.
3897 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
3898 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3899 !BO->getType()->isVectorTy();
3900 };
3901
3902 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3903 // certain scalar type and a certain vector register width.
// NOTE(review): original line 3906 (the register-width query expression,
// presumably getRegisterBitWidth(...)) is missing from this extraction.
// A lane is "equivalent to 0" when its bit offset is a multiple of the
// register width, i.e. it is lane 0 of one of the split registers.
3904 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3905 auto RegWidth =
3907 .getFixedValue();
3908 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3909 };
3910
3911 // Check if the type constraints on input vector type and result scalar type
3912 // of extractelement instruction are satisfied.
3913 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3914 return false;
3915
// SLP-vectorizer path: the extract does not exist as IR yet; its users are
// described by Scalar plus the (scalar, user, lane) triples.
3916 if (Scalar) {
3917 DenseMap<User *, unsigned> UserToExtractIdx;
3918 for (auto *U : Scalar->users()) {
3919 if (!IsUserFMulScalarTy(U))
3920 return false;
3921 // Recording entry for the user is important. Index value is not
3922 // important.
3923 UserToExtractIdx[U];
3924 }
3925 if (UserToExtractIdx.empty())
3926 return false;
3927 for (auto &[S, U, L] : ScalarUserAndIdx) {
3928 for (auto *U : S->users()) {
3929 if (UserToExtractIdx.contains(U)) {
3930 auto *FMul = cast<BinaryOperator>(U);
3931 auto *Op0 = FMul->getOperand(0);
3932 auto *Op1 = FMul->getOperand(1);
// NOTE(review): this condition is a tautology — "(both operands == S) or
// either operand != S" is always true for any Op0/Op1. It likely intends
// something narrower (e.g. excluding the both-operands-same-scalar case);
// flagging for upstream confirmation rather than changing behavior here.
3933 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3934 UserToExtractIdx[U] = L;
3935 break;
3936 }
3937 }
3938 }
3939 }
// The fold only applies if, for every fmul user, at least one of the two
// extracts comes from lane 0 (or a lane equivalent to 0 after splitting).
3940 for (auto &[U, L] : UserToExtractIdx) {
3941 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3942 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3943 return false;
3944 }
// IR path: inspect the actual extractelement instruction and its users.
3945 } else {
3946 const auto *EE = cast<ExtractElementInst>(I);
3947
// Non-constant lane index: cannot prove the fold, so charge full cost.
3948 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3949 if (!IdxOp)
3950 return false;
3951
3952 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3953 if (!IsUserFMulScalarTy(U))
3954 return false;
3955
3956 // Check if the other operand of extractelement is also extractelement
3957 // from lane equivalent to 0.
3958 const auto *BO = cast<BinaryOperator>(U);
3959 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3960 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3961 if (OtherEE) {
3962 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3963 if (!IdxOp)
3964 return false;
3965 return IsExtractLaneEquivalentToZero(
3966 cast<ConstantInt>(OtherEE->getIndexOperand())
3967 ->getValue()
3968 .getZExtValue(),
3969 OtherEE->getType()->getScalarSizeInBits());
3970 }
3971 return true;
3972 });
3973 }
3974 return true;
3975 };
3976
3977 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3978 ExtractCanFuseWithFmul())
3979 return 0;
3980
3981 // All other insert/extracts cost this much.
3982 return CostKind == TTI::TCK_CodeSize ? 1
3983 : ST->getVectorInsertExtractBaseCost();
3984}
3985
// getVectorInstrCost overload taking the instruction's operands (Op0 = the
// vector operand, Op1 = the scalar for inserts). Delegates to
// getVectorInstrCostHelper after handling one special case inline.
// NOTE(review): the leading signature lines (return type, name, and the
// Opcode/Val/CostKind parameters) are not visible in this extraction.
3988 unsigned Index,
3989 const Value *Op0,
3990 const Value *Op1) const {
3991 // Treat insert at lane 0 into a poison vector as having zero cost. This
3992 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
3993 // single dup) are treated as cheap.
3994 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
3995 isa<PoisonValue>(Op0))
3996 return 0;
3997 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
3998}
3999
// getVectorInstrCost overload used by the SLP vectorizer: the extract does not
// exist as IR yet, so the scalar being vectorized and its (scalar, user, lane)
// triples are passed instead of an Instruction. Thin forwarder to the helper.
// NOTE(review): the first signature line (return type and function name) is
// not visible in this extraction.
4001 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4002 Value *Scalar,
4003 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4004 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4005 ScalarUserAndIdx);
4006}
4007
// getVectorInstrCost overload taking the actual IR instruction, so the helper
// can inspect its operands/users for foldable patterns (LD1-lane, fmul fusion).
// NOTE(review): the leading signature lines (return type, name, and the
// `const Instruction &I` parameter) are not visible in this extraction.
4009 Type *Val,
4011 unsigned Index) const {
4012 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
4013}
4014
// Cost of extracting an element addressed from the end of a vector. Fixed
// vectors delegate to the regular path; scalable vectors need predicate-based
// extraction instead.
// NOTE(review): the leading signature lines (return type, function name, and
// earlier parameters) plus original lines 4020 (the delegated call) and 4030
// (the non-code-size cost arm of the final ternary) are missing from this
// extraction — confirm against the full source.
4018 unsigned Index) const {
4019 if (isa<FixedVectorType>(Val))
4021 Index);
4022
4023 // This typically requires both while and lastb instructions in order
4024 // to extract the last element. If this is in a loop the while
4025 // instruction can at least be hoisted out, although it will consume a
4026 // predicate register. The cost should be more expensive than the base
4027 // extract cost, which is 2 for most CPUs.
4028 return CostKind == TTI::TCK_CodeSize
4029 ? 2
4031}
4032
// Overhead of scalarizing a vector: inserts/extracts for each demanded
// element. Floating-point element types fall back to the generic estimate;
// integer types are costed as popcount(demanded) * (insert + extract) moves.
// NOTE(review): the leading signature line, original line 4038 (the
// scalable-vector return, presumably an invalid/unsupported cost), and line
// 4043 (the VecInstCost initializer) are missing from this extraction —
// confirm against the full source.
4034 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4035 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4036 ArrayRef<Value *> VL) const {
4037 if (isa<ScalableVectorType>(Ty))
4039 if (Ty->getElementType()->isFloatingPointTy())
4040 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4041 CostKind);
4042 unsigned VecInstCost =
// `Insert` and `Extract` are bools, so (Insert + Extract) is 0, 1, or 2 —
// one per direction of data movement per demanded element.
4044 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4045}
4046
// Extra cost of emulating an f16/bf16 operation by promoting to f32:
// fpext the operand(s), run the op at f32 (priced by the InstCost callback),
// and optionally fptrunc the result back. Returns std::nullopt when no
// promotion is needed (non-half/bfloat element type, or f16 with native
// FullFP16 support), in which case the caller uses its normal costing.
// NOTE(review): original line 4048 (remaining leading parameters, presumably
// Ty/CostKind/Op1Info) and lines 4058/4065 (the trailing CastContextHint /
// CostKind arguments of the two getCastInstrCost calls) are missing from this
// extraction — confirm against the full source.
4047std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4049 TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
4050 std::function<InstructionCost(Type *)> InstCost) const {
4051 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4052 return std::nullopt;
4053 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4054 return std::nullopt;
4055
4056 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
// Charge one fpext; double it when neither operand is a constant, since both
// operands then need extending (constants can be folded at the wider type).
4057 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4059 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4060 Cost *= 2;
4061 Cost += InstCost(PromotedTy);
4062 if (IncludeTrunc)
4063 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4064
4066 return Cost;
4067
4069 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4071 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4072
4073 // The code-generator is currently not able to handle scalable vectors
4074 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4075 // it. This change will be removed when code-generation for these types is
4076 // sufficiently reliable.
4077 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4078 if (VTy->getElementCount() == ElementCount::getScalable(1))
4080
4081 // TODO: Handle more cost kinds.
4083 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4084 Op2Info, Args, CxtI);
4085
4086 // Legalize the type.
4087 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4088 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4089
4090 // Increase the cost for half and bfloat types if not architecturally
4091 // supported.
4092 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4093 ISD == ISD::FDIV || ISD == ISD::FREM)
4094 if (auto PromotedCost = getFP16BF16PromoteCost(
4095 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4096 [&](Type *PromotedTy) {
4097 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4098 Op1Info, Op2Info);
4099 }))
4100 return *PromotedCost;
4101
4102 switch (ISD) {
4103 default:
4104 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4105 Op2Info);
4106 case ISD::SREM:
4107 case ISD::SDIV:
4108 /*
4109 Notes for sdiv/srem specific costs:
4110 1. This only considers the cases where the divisor is constant, uniform and
4111 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4112 result in some form of (ldr + adrp), corresponding to constant vectors, or
4113 scalarization of the division operation.
4114 2. Constant divisors, either negative in whole or partially, don't result in
4115 significantly different codegen as compared to positive constant divisors.
4116 So, we don't consider negative divisors separately.
4117 3. If the codegen is significantly different with SVE, it has been indicated
4118 using comments at appropriate places.
4119
4120 sdiv specific cases:
4121 -----------------------------------------------------------------------
4122 codegen | pow-of-2 | Type
4123 -----------------------------------------------------------------------
4124 add + cmp + csel + asr | Y | i64
4125 add + cmp + csel + asr | Y | i32
4126 -----------------------------------------------------------------------
4127
4128 srem specific cases:
4129 -----------------------------------------------------------------------
4130 codegen | pow-of-2 | Type
4131 -----------------------------------------------------------------------
4132 negs + and + and + csneg | Y | i64
4133 negs + and + and + csneg | Y | i32
4134 -----------------------------------------------------------------------
4135
4136 other sdiv/srem cases:
4137 -------------------------------------------------------------------------
4138 common codegen | + srem | + sdiv | pow-of-2 | Type
4139 -------------------------------------------------------------------------
4140 smulh + asr + add + add | - | - | N | i64
4141 smull + lsr + add + add | - | - | N | i32
4142 usra | and + sub | sshr | Y | <2 x i64>
4143 2 * (scalar code) | - | - | N | <2 x i64>
4144 usra | bic + sub | sshr + neg | Y | <4 x i32>
4145 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4146 + sshr + usra | | | |
4147 -------------------------------------------------------------------------
4148 */
4149 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4150 InstructionCost AddCost =
4151 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4152 Op1Info.getNoProps(), Op2Info.getNoProps());
4153 InstructionCost AsrCost =
4154 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4155 Op1Info.getNoProps(), Op2Info.getNoProps());
4156 InstructionCost MulCost =
4157 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4158 Op1Info.getNoProps(), Op2Info.getNoProps());
4159 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4160 // have similar cost.
4161 auto VT = TLI->getValueType(DL, Ty);
4162 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4163 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4164 // Neg can be folded into the asr instruction.
4165 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4166 : (3 * AsrCost + AddCost);
4167 } else {
4168 return MulCost + AsrCost + 2 * AddCost;
4169 }
4170 } else if (VT.isVector()) {
4171 InstructionCost UsraCost = 2 * AsrCost;
4172 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4173 // Division with scalable types corresponds to native 'asrd'
4174 // instruction when SVE is available.
4175 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4176
4177 // One more for the negation in SDIV
4179 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4180 if (Ty->isScalableTy() && ST->hasSVE())
4181 Cost += 2 * AsrCost;
4182 else {
4183 Cost +=
4184 UsraCost +
4185 (ISD == ISD::SDIV
4186 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4187 : 2 * AddCost);
4188 }
4189 return Cost;
4190 } else if (LT.second == MVT::v2i64) {
4191 return VT.getVectorNumElements() *
4193 Op1Info.getNoProps(),
4194 Op2Info.getNoProps());
4195 } else {
4196 // When SVE is available, we get:
4197 // smulh + lsr + add/sub + asr + add/sub.
4198 if (Ty->isScalableTy() && ST->hasSVE())
4199 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4200 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4201 }
4202 }
4203 }
4204 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4205 LT.second.isFixedLengthVector()) {
4206 // FIXME: When the constant vector is non-uniform, this may result in
4207 // loading the vector from constant pool or in some cases, may also result
4208 // in scalarization. For now, we are approximating this with the
4209 // scalarization cost.
4210 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4211 CostKind, -1, nullptr, nullptr);
4212 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4213 CostKind, -1, nullptr, nullptr);
4214 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4215 return ExtractCost + InsertCost +
4216 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4217 CostKind, Op1Info.getNoProps(),
4218 Op2Info.getNoProps());
4219 }
4220 [[fallthrough]];
4221 case ISD::UDIV:
4222 case ISD::UREM: {
4223 auto VT = TLI->getValueType(DL, Ty);
4224 if (Op2Info.isConstant()) {
4225 // If the operand is a power of 2 we can use the shift or and cost.
4226 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4227 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4228 Op1Info.getNoProps(),
4229 Op2Info.getNoProps());
4230 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4231 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4232 Op1Info.getNoProps(),
4233 Op2Info.getNoProps());
4234
4235 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4236 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4237 // The MULHU will be expanded to UMULL for the types not listed below,
4238 // and will become a pair of UMULL+MULL2 for 128bit vectors.
4239 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4240 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4241 LT.second == MVT::nxv16i8;
4242 bool Is128bit = LT.second.is128BitVector();
4243
4244 InstructionCost MulCost =
4245 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4246 Op1Info.getNoProps(), Op2Info.getNoProps());
4247 InstructionCost AddCost =
4248 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4249 Op1Info.getNoProps(), Op2Info.getNoProps());
4250 InstructionCost ShrCost =
4251 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4252 Op1Info.getNoProps(), Op2Info.getNoProps());
4253 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4254 (HasMULH ? 0 : ShrCost) + // UMULL shift
4255 AddCost * 2 + ShrCost;
4256 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4257 }
4258 }
4259
4260 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4261 // emitted by the backend even when those functions are not declared in the
4262 // module.
4263 if (!VT.isVector() && VT.getSizeInBits() > 64)
4264 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4265
4267 Opcode, Ty, CostKind, Op1Info, Op2Info);
4268 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4269 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4270 // SDIV/UDIV operations are lowered using SVE, then we can have less
4271 // costs.
4272 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4273 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4274 static const CostTblEntry DivTbl[]{
4275 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4276 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4277 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4278 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4279 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4280 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4281
4282 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4283 if (nullptr != Entry)
4284 return Entry->Cost;
4285 }
4286 // For 8/16-bit elements, the cost is higher because the type
4287 // requires promotion and possibly splitting:
4288 if (LT.second.getScalarType() == MVT::i8)
4289 Cost *= 8;
4290 else if (LT.second.getScalarType() == MVT::i16)
4291 Cost *= 4;
4292 return Cost;
4293 } else {
4294 // If one of the operands is a uniform constant then the cost for each
4295 // element is Cost for insertion, extraction and division.
4296 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4297 // operation with scalar type
4298 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4299 (Op2Info.isConstant() && Op2Info.isUniform())) {
4300 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4302 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4303 return (4 + DivCost) * VTy->getNumElements();
4304 }
4305 }
4306 // On AArch64, without SVE, vector divisions are expanded
4307 // into scalar divisions of each pair of elements.
4308 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4309 -1, nullptr, nullptr);
4310 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4311 nullptr, nullptr);
4312 }
4313
4314 // TODO: if one of the arguments is scalar, then it's not necessary to
4315 // double the cost of handling the vector elements.
4316 Cost += Cost;
4317 }
4318 return Cost;
4319 }
4320 case ISD::MUL:
4321 // When SVE is available, then we can lower the v2i64 operation using
4322 // the SVE mul instruction, which has a lower cost.
4323 if (LT.second == MVT::v2i64 && ST->hasSVE())
4324 return LT.first;
4325
4326 // When SVE is not available, there is no MUL.2d instruction,
4327 // which means mul <2 x i64> is expensive as elements are extracted
4328 // from the vectors and the muls scalarized.
4329 // As getScalarizationOverhead is a bit too pessimistic, we
4330 // estimate the cost for a i64 vector directly here, which is:
4331 // - four 2-cost i64 extracts,
4332 // - two 2-cost i64 inserts, and
4333 // - two 1-cost muls.
4334 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
4335 // LT.first = 2 the cost is 28. If both operands are extensions it will not
4336 // need to scalarize so the cost can be cheaper (smull or umull).
4337 // so the cost can be cheaper (smull or umull).
4338 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
4339 return LT.first;
4340 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4342 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4343 nullptr, nullptr) *
4344 2 +
4345 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4346 nullptr, nullptr));
4347 case ISD::ADD:
4348 case ISD::XOR:
4349 case ISD::OR:
4350 case ISD::AND:
4351 case ISD::SRL:
4352 case ISD::SRA:
4353 case ISD::SHL:
4354 // These nodes are marked as 'custom' for combining purposes only.
4355 // We know that they are legal. See LowerAdd in ISelLowering.
4356 return LT.first;
4357
4358 case ISD::FNEG:
4359 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4360 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4361 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4362 CxtI &&
4363 ((CxtI->hasOneUse() &&
4364 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4365 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4366 return 0;
4367 [[fallthrough]];
4368 case ISD::FADD:
4369 case ISD::FSUB:
4370 if (!Ty->getScalarType()->isFP128Ty())
4371 return LT.first;
4372 [[fallthrough]];
4373 case ISD::FMUL:
4374 case ISD::FDIV:
4375 // These nodes are marked as 'custom' just to lower them to SVE.
4376 // We know said lowering will incur no additional cost.
4377 if (!Ty->getScalarType()->isFP128Ty())
4378 return 2 * LT.first;
4379
4380 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4381 Op2Info);
4382 case ISD::FREM:
4383 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4384 // those functions are not declared in the module.
4385 if (!Ty->isVectorTy())
4386 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4387 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4388 Op2Info);
4389 }
4390}
4391
// AArch64TTIImpl::getAddressComputationCost — cost of forming the address for
// a memory access. NOTE(review): the listing dropped the first signature
// line(s) here; the visible parameters are the pointer SCEV and (presumably)
// the pointer type and ScalarEvolution — confirm against the header.
4394 const SCEV *Ptr,
4396 // Address computations in vectorized code with non-consecutive addresses will
4397 // likely result in more instructions compared to scalar code where the
4398 // computation can more often be merged into the index mode. The resulting
4399 // extra micro-ops can significantly decrease throughput.
4400 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4401 int MaxMergeDistance = 64;
4402
// Vector addresses whose stride is unknown or larger than MaxMergeDistance
// pay the NEON non-constant-stride penalty (default 10, tunable via
// -neon-nonconst-stride-overhead).
4403 if (PtrTy->isVectorTy() && SE &&
4404 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4405 return NumVectorInstToHideOverhead;
4406
4407 // In many cases the address computation is not merged into the instruction
4408 // addressing mode.
4409 return 1;
4410}
4411
// AArch64TTIImpl::getCmpSelInstrCost — cost of compare and select
// instructions. Handles: fixed-vector selects (min/max-style chains and a
// conversion cost table), FCmp (fp16/bf16 promotion, libcall fallback for
// unsupported scalar types, multi-compare predicates) and ICmp folded into a
// preceding 'and'.
4413 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4415 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4416 // We don't lower some vector selects well that are wider than the register
4417 // width. TODO: Improve this with different cost kinds.
4418 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4419 // We would need this many instructions to hide the scalarization happening.
4420 const int AmortizationCost = 20;
4421
4422 // If VecPred is not set, check if we can get a predicate from the context
4423 // instruction, if its type matches the requested ValTy.
4424 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4425 CmpPredicate CurrentPred;
4426 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4427 m_Value())))
4428 VecPred = CurrentPred;
4429 }
4430 // Check if we have a compare/select chain that can be lowered using
4431 // a (F)CMxx & BFI pair.
4432 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4433 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4434 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4435 VecPred == CmpInst::FCMP_UNE) {
4436 static const auto ValidMinMaxTys = {
4437 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4438 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4439 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4440
4441 auto LT = getTypeLegalizationCost(ValTy);
4442 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4443 (ST->hasFullFP16() &&
4444 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
4445 return LT.first;
4446 }
4447
// Fallback table: per (condition type, value type) select costs; the wide
// i64 cases are scalarized and carry the AmortizationCost multiplier.
4448 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4449 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4450 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4451 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4452 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4453 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4454 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4455 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4456 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4457 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4458 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4459 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4460
4461 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4462 EVT SelValTy = TLI->getValueType(DL, ValTy);
4463 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4464 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4465 SelCondTy.getSimpleVT(),
4466 SelValTy.getSimpleVT()))
4467 return Entry->Cost;
4468 }
4469 }
4470
4471 if (Opcode == Instruction::FCmp) {
// fp16 without +fullfp16 (and bf16) compares are costed as: compare in the
// promoted type, plus a predicate truncate for vectors.
4472 if (auto PromotedCost = getFP16BF16PromoteCost(
4473 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4474 [&](Type *PromotedTy) {
4476 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4477 CostKind, Op1Info, Op2Info);
4478 if (isa<VectorType>(PromotedTy))
4480 Instruction::Trunc,
4481 VectorType::getInteger(cast<VectorType>(ValTy)),
4482 VectorType::getInteger(cast<VectorType>(PromotedTy)),
4484 return Cost;
4485 }))
4486 return *PromotedCost;
4487
4488 auto LT = getTypeLegalizationCost(ValTy);
4489 // Model unknown fp compares as a libcall.
4490 if (LT.second.getScalarType() != MVT::f64 &&
4491 LT.second.getScalarType() != MVT::f32 &&
4492 LT.second.getScalarType() != MVT::f16)
4493 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4494 {ValTy, ValTy}, CostKind);
4495
4496 // Some comparison operators require expanding to multiple compares + or.
4497 unsigned Factor = 1;
4498 if (!CondTy->isVectorTy() &&
4499 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4500 Factor = 2; // fcmp with 2 selects
4501 else if (isa<FixedVectorType>(ValTy) &&
4502 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4503 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4504 Factor = 3; // fcmxx+fcmyy+or
4505 else if (isa<ScalableVectorType>(ValTy) &&
4506 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4507 Factor = 3; // fcmxx+fcmyy+or
4508
4509 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4510 }
4511
4512 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4513 // icmp(and, 0) as free, as we can make use of ands, but only if the
4514 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4515 // providing it will not cause performance regressions.
4516 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4517 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4518 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4519 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4520 if (match(I->getOperand(1), m_Zero()))
4521 return 0;
4522
4523 // x >= 1 / x < 1 -> x > 0 / x <= 0
4524 if (match(I->getOperand(1), m_One()) &&
4525 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4526 return 0;
4527
4528 // x <= -1 / x > -1 -> x > 0 / x <= 0
4529 if (match(I->getOperand(1), m_AllOnes()) &&
4530 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4531 return 0;
4532 }
4533
4534 // The base case handles scalable vectors fine for now, since it treats the
4535 // cost as 1 * legalization cost.
4536 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4537 Op1Info, Op2Info, I);
4538}
4539
// Configure inline memcmp() expansion: which load widths to use, how many
// loads per comparison, and which odd tail sizes may be expanded. Returns a
// default (disabled) Options object when strict alignment is required.
4541 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4543 if (ST->requiresStrictAlign()) {
4544 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4545 // a bunch of instructions when strict align is enabled.
4546 return Options;
4547 }
4548 Options.AllowOverlappingLoads = true;
4549 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4550 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4551 // TODO: Though vector loads usually perform well on AArch64, in some targets
4552 // they may wake up the FP unit, which raises the power consumption. Perhaps
4553 // they could be used with no holds barred (-O3).
4554 Options.LoadSizes = {8, 4, 2, 1};
// Odd tail sizes 3, 5 and 6 are allowed to expand (presumably via a pair of
// overlapping loads — confirm in MemCmpExpand).
4555 Options.AllowedTailExpansions = {3, 5, 6};
4556 return Options;
4557}
4558
4560 return ST->hasSVE();
4561}
4562
// AArch64TTIImpl::getMaskedMemoryOpCost — cost of a masked load/store. NEON
// fixed vectors fall back to the (expensive) base implementation; SVE types
// cost the legalization factor, with unsupported element types rejected.
4565 Align Alignment, unsigned AddressSpace,
4567 if (useNeonVector(Src))
4568 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4569 CostKind);
4570 auto LT = getTypeLegalizationCost(Src);
4571 if (!LT.first.isValid())
4573
4574 // Return an invalid cost for element types that we are unable to lower.
4575 auto *VT = cast<VectorType>(Src);
4576 if (VT->getElementType()->isIntegerTy(1))
4578
4579 // The code-generator is currently not able to handle scalable vectors
4580 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4581 // it. This change will be removed when code-generation for these types is
4582 // sufficiently reliable.
4583 if (VT->getElementCount() == ElementCount::getScalable(1))
4585
4586 return LT.first;
4587}
4588
4589 // This function returns gather/scatter overhead either from
4590 // user-provided value or specialized values per-target from \p ST.
4591 static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4592 const AArch64Subtarget *ST) {
4593 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4594 "Should be called on only load or stores.");
4595 switch (Opcode) {
4596 case Instruction::Load:
// An explicit -sve-gather-overhead=N on the command line overrides the
// subtarget's tuned default.
4597 if (SVEGatherOverhead.getNumOccurrences() > 0)
4598 return SVEGatherOverhead;
4599 return ST->getGatherOverhead();
4601 case Instruction::Store:
// Likewise -sve-scatter-overhead overrides the subtarget value.
4602 if (SVEScatterOverhead.getNumOccurrences() > 0)
4603 return SVEScatterOverhead;
4604 return ST->getScatterOverhead();
4606 default:
4607 llvm_unreachable("Shouldn't have reached here");
4608 }
4609 }
4610
// AArch64TTIImpl::getGatherScatterOpCost — cost of an SVE gather/scatter:
// per-element memory-op cost, scaled by the gather/scatter overhead and the
// (legalized) element count. Illegal configurations fall back or are
// rejected as invalid.
4612 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
4613 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
4614 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4615 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
4616 Alignment, CostKind, I);
4617 auto *VT = cast<VectorType>(DataTy);
4618 auto LT = getTypeLegalizationCost(DataTy);
4619 if (!LT.first.isValid())
4621
4622 // Return an invalid cost for element types that we are unable to lower.
4623 if (!LT.second.isVector() ||
4624 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4625 VT->getElementType()->isIntegerTy(1))
4627
4628 // The code-generator is currently not able to handle scalable vectors
4629 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4630 // it. This change will be removed when code-generation for these types is
4631 // sufficiently reliable.
4632 if (VT->getElementCount() == ElementCount::getScalable(1))
4634
4635 ElementCount LegalVF = LT.second.getVectorElementCount();
4636 InstructionCost MemOpCost =
4637 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4638 {TTI::OK_AnyValue, TTI::OP_None}, I);
4639 // Add on an overhead cost for using gathers/scatters.
4640 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4641 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4642}
4643
4645 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4646}
4647
// AArch64TTIImpl::getMemoryOpCost — cost of a plain load/store. Special
// cases: struct types (base impl), <vscale x 1 x T> and partial predicate
// vectors (invalid), slow misaligned 128-bit stores, pointer vectors
// (LDP/STP), and NEON truncating/extending or non-power-of-2 accesses.
4649 Align Alignment,
4650 unsigned AddressSpace,
4652 TTI::OperandValueInfo OpInfo,
4653 const Instruction *I) const {
4654 EVT VT = TLI->getValueType(DL, Ty, true);
4655 // Type legalization can't handle structs
4656 if (VT == MVT::Other)
4657 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4658 CostKind);
4659
4660 auto LT = getTypeLegalizationCost(Ty);
4661 if (!LT.first.isValid())
4663
4664 // The code-generator is currently not able to handle scalable vectors
4665 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4666 // it. This change will be removed when code-generation for these types is
4667 // sufficiently reliable.
4668 // We also only support full register predicate loads and stores.
4669 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4670 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4671 (VTy->getElementType()->isIntegerTy(1) &&
4672 !VTy->getElementCount().isKnownMultipleOf(
4675
4676 // TODO: consider latency as well for TCK_SizeAndLatency.
4678 return LT.first;
4679
4681 return 1;
4682
4683 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4684 LT.second.is128BitVector() && Alignment < Align(16)) {
4685 // Unaligned stores are extremely inefficient. We don't split all
4686 // unaligned 128-bit stores because the negative impact that has shown in
4687 // practice on inlined block copy code.
4688 // We make such stores expensive so that we will only vectorize if there
4689 // are 6 other instructions getting vectorized.
4690 const int AmortizationCost = 6;
4691
4692 return LT.first * 2 * AmortizationCost;
4693 }
4694
4695 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4696 if (Ty->isPtrOrPtrVectorTy())
4697 return LT.first;
4698
4699 if (useNeonVector(Ty)) {
4700 // Check truncating stores and extending loads.
4701 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4702 // v4i8 types are lowered to scalar a load/store and sshll/xtn.
4703 if (VT == MVT::v4i8)
4704 return 2;
4705 // Otherwise we need to scalarize.
4706 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4707 }
4708 EVT EltVT = VT.getVectorElementType();
4709 unsigned EltSize = EltVT.getScalarSizeInBits();
4710 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4711 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4712 return LT.first;
4713 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4714 // widening to v4i8, which produces suboptimal results.
4715 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4716 return LT.first;
4717
4718 // Check non-power-of-2 loads/stores for legal vector element types with
4719 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4720 // operations on smaller power-of-2 ops, including ld1/st1.
4721 LLVMContext &C = Ty->getContext();
// Worklist splits a non-power-of-2 element count into power-of-2 pieces,
// charging one memory op per resulting piece.
4723 SmallVector<EVT> TypeWorklist;
4724 TypeWorklist.push_back(VT);
4725 while (!TypeWorklist.empty()) {
4726 EVT CurrVT = TypeWorklist.pop_back_val();
4727 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4728 if (isPowerOf2_32(CurrNumElements)) {
4729 Cost += 1;
4730 continue;
4731 }
4732
4733 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4734 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4735 TypeWorklist.push_back(
4736 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4737 }
4738 return Cost;
4739 }
4740
4741 return LT.first;
4742}
4743
// AArch64TTIImpl::getInterleavedMemoryOpCost — cost of interleaved group
// loads/stores. When the de-interleaved sub-vector is legal for ldN/stN, the
// cost is Factor * number-of-accesses; otherwise fall back to the base model.
4745 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4746 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4747 bool UseMaskForCond, bool UseMaskForGaps) const {
4748 assert(Factor >= 2 && "Invalid interleave factor");
4749 auto *VecVTy = cast<VectorType>(VecTy);
4750
4751 if (VecTy->isScalableTy() && !ST->hasSVE())
4753
4754 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4755 // only have lowering for power-of-2 factors.
4756 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
4757 // InterleavedAccessPass for ld3/st3
4758 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
4760
4761 // Vectorization for masked interleaved accesses is only enabled for scalable
4762 // VF.
4763 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4765
4766 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4767 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4768 auto *SubVecTy =
4769 VectorType::get(VecVTy->getElementType(),
4770 VecVTy->getElementCount().divideCoefficientBy(Factor));
4771
4772 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4773 // Accesses having vector types that are a multiple of 128 bits can be
4774 // matched to more than one ldN/stN instruction.
4775 bool UseScalable;
4776 if (MinElts % Factor == 0 &&
4777 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4778 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4779 }
4780
4781 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4782 Alignment, AddressSpace, CostKind,
4783 UseMaskForCond, UseMaskForGaps);
4784}
4785
// AArch64TTIImpl::getCostOfKeepingLiveOverCall — cost of spilling/reloading
// the given types across a call: each 128-bit fixed vector is charged one
// store plus one load (callee-saved FP registers only preserve 64 bits).
4790 for (auto *I : Tys) {
4791 if (!I->isVectorTy())
4792 continue;
4793 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4794 128)
4795 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4796 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4797 }
4798 return Cost;
4799}
4800
4802 return ST->getMaxInterleaveFactor();
4803}
4804
4805 // For Falkor, we want to avoid having too many strided loads in a loop since
4806 // that can exhaust the HW prefetcher resources. We adjust the unroller
4807 // MaxCount preference below to attempt to ensure unrolling doesn't create too
4808 // many strided loads.
4809 static void
4812 enum { MaxStridedLoads = 7 };
// Count affine-AddRec (strided) loads in the loop, stopping early once more
// than MaxStridedLoads/2 have been seen — past that point the exact count no
// longer changes the MaxCount computation below.
4813 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4814 int StridedLoads = 0;
4815 // FIXME? We could make this more precise by looking at the CFG and
4816 // e.g. not counting loads in each side of an if-then-else diamond.
4817 for (const auto BB : L->blocks()) {
4818 for (auto &I : *BB) {
4819 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4820 if (!LMemI)
4821 continue;
4822
4823 Value *PtrValue = LMemI->getPointerOperand();
4824 if (L->isLoopInvariant(PtrValue))
4825 continue;
4826
4827 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4828 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4829 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4830 continue;
4831
4832 // FIXME? We could take pairing of unrolled load copies into account
4833 // by looking at the AddRec, but we would probably have to limit this
4834 // to loops with no stores or other memory optimization barriers.
4835 ++StridedLoads;
4836 // We've seen enough strided loads that seeing more won't make a
4837 // difference.
4838 if (StridedLoads > MaxStridedLoads / 2)
4839 return StridedLoads;
4840 }
4841 }
4842 return StridedLoads;
4843 };
4844
4845 int StridedLoads = countStridedLoads(L, SE);
4846 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4847 << " strided loads\n");
4848 // Pick the largest power of 2 unroll count that won't result in too many
4849 // strided loads.
4850 if (StridedLoads) {
4851 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
4852 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4853 << UP.MaxCount << '\n');
4854 }
4855}
4856
4857 // This function returns true if the loop:
4858 // 1. Has a valid cost, and
4859 // 2. Has a cost within the supplied budget.
4860 // Otherwise it returns false.
// On success, *FinalSize (if non-null) receives the total loop cost.
4862 InstructionCost Budget,
4863 unsigned *FinalSize) {
4864 // Estimate the size of the loop.
4865 InstructionCost LoopCost = 0;
4866
4867 for (auto *BB : L->getBlocks()) {
4868 for (auto &I : *BB) {
4869 SmallVector<const Value *, 4> Operands(I.operand_values());
4872 // This can happen with intrinsics that don't currently have a cost model
4873 // or for some operations that require SVE.
4874 if (!Cost.isValid())
4875 return false;
4876
4877 LoopCost += Cost;
// Bail out as soon as the running total exceeds the budget.
4878 if (LoopCost > Budget)
4879 return false;
4880 }
4881 }
4882
4883 if (FinalSize)
4884 *FinalSize = LoopCost.getValue();
4885 return true;
4886}
4887
// Heuristic: should a small, two-block, multi-exit loop (std::find-like) be
// runtime-unrolled? Requires a symbolic (non-constant) backedge-taken count,
// a max trip count of 0 or > 32, total size <= 5, and branch terminators.
4889 const AArch64TTIImpl &TTI) {
4890 // Only consider loops with unknown trip counts for which we can determine
4891 // a symbolic expression. Multi-exit loops with small known trip counts will
4892 // likely be unrolled anyway.
4893 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4894 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
4895 return false;
4896
4897 // It might not be worth unrolling loops with low max trip counts. Restrict
4898 // this to max trip counts > 32 for now.
4899 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
4900 if (MaxTC > 0 && MaxTC <= 32)
4901 return false;
4902
4903 // Make sure the loop size is <= 5.
4904 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
4905 return false;
4906
4907 // Small search loops with multiple exits can be highly beneficial to unroll.
4908 // We only care about loops with exactly two exiting blocks, although each
4909 // block could jump to the same exit block.
4910 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
4911 if (Blocks.size() != 2)
4912 return false;
4913
4914 if (any_of(Blocks, [](BasicBlock *BB) {
4915 return !isa<BranchInst>(BB->getTerminator());
4916 }))
4917 return false;
4918
4919 return true;
4920}
4921
4922 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4923 /// OOO engine's wide instruction window and various predictors.
4924 static void
4927 const AArch64TTIImpl &TTI) {
4928 // Limit loops with structure that is highly likely to benefit from runtime
4929 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
4930 // likely with complex control flow). Note that the heuristics here may be
4931 // overly conservative and we err on the side of avoiding runtime unrolling
4932 // rather than unroll excessively. They are all subject to further refinement.
4933 if (!L->isInnermost() || L->getNumBlocks() > 8)
4934 return;
4935
4936 // Loops with multiple exits are handled by common code.
4937 if (!L->getExitBlock())
4938 return;
4939
// Skip loops whose trip count is a known constant or unknowable, or whose
// max trip count is small (<= 32) — unlikely to pay off.
4940 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4941 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4942 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4943 SE.getSmallConstantMaxTripCount(L) <= 32))
4944 return;
4945
4946 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4947 return;
4948
4950 return;
4951
4952 // Limit to loops with trip counts that are cheap to expand.
4953 UP.SCEVExpansionBudget = 1;
4954
4955 // Try to unroll small loops, of few-blocks with low budget, if they have
4956 // load/store dependencies, to expose more parallel memory access streams,
4957 // or if they do little work inside a block (i.e. load -> X -> store pattern).
4958 BasicBlock *Header = L->getHeader();
4959 BasicBlock *Latch = L->getLoopLatch();
4960 if (Header == Latch) {
4961 // Estimate the size of the loop.
4962 unsigned Size;
4963 unsigned Width = 10;
4964 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
4965 return;
4966
4967 // Try to find an unroll count that maximizes the use of the instruction
4968 // window, i.e. trying to fetch as many instructions per cycle as possible.
4969 unsigned MaxInstsPerLine = 16;
4970 unsigned UC = 1;
4971 unsigned BestUC = 1;
4972 unsigned SizeWithBestUC = BestUC * Size;
4973 while (UC <= 8) {
4974 unsigned SizeWithUC = UC * Size;
4975 if (SizeWithUC > 48)
4976 break;
4977 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4978 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4979 BestUC = UC;
4980 SizeWithBestUC = BestUC * Size;
4981 }
4982 UC++;
4983 }
4984
4985 if (BestUC == 1)
4986 return;
4987
// Collect loop-varying loads (plus their first in-loop users) and stores;
// only unroll if some store forwards a loaded/derived value.
4988 SmallPtrSet<Value *, 8> LoadedValuesPlus;
4990 for (auto *BB : L->blocks()) {
4991 for (auto &I : *BB) {
4993 if (!Ptr)
4994 continue;
4995 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4996 if (SE.isLoopInvariant(PtrSCEV, L))
4997 continue;
4998 if (isa<LoadInst>(&I)) {
4999 LoadedValuesPlus.insert(&I);
5000 // Include in-loop 1st users of loaded values.
5001 for (auto *U : I.users())
5002 if (L->contains(cast<Instruction>(U)))
5003 LoadedValuesPlus.insert(U);
5004 } else
5005 Stores.push_back(cast<StoreInst>(&I));
5006 }
5007 }
5008
5009 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5010 return LoadedValuesPlus.contains(SI->getOperand(0));
5011 }))
5012 return;
5013
5014 UP.Runtime = true;
5015 UP.DefaultUnrollRuntimeCount = BestUC;
5016 return;
5017 }
5018
5019 // Try to runtime-unroll loops with early-continues depending on loop-varying
5020 // loads; this helps with branch-prediction for the early-continues.
5021 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5023 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5024 !llvm::is_contained(Preds, Header) ||
5025 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5026 return;
5027
// Depth-limited (<= 8) walk of the condition's operand tree looking for a
// loop-varying load feeding the header branch.
5028 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5029 [&](Instruction *I, unsigned Depth) -> bool {
5030 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5031 return false;
5032
5033 if (isa<LoadInst>(I))
5034 return true;
5035
5036 return any_of(I->operands(), [&](Value *V) {
5037 auto *I = dyn_cast<Instruction>(V);
5038 return I && DependsOnLoopLoad(I, Depth + 1);
5039 });
5040 };
5041 CmpPredicate Pred;
5042 Instruction *I;
5043 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5044 m_Value())) &&
5045 DependsOnLoopLoad(I, 0)) {
5046 UP.Runtime = true;
5047 }
5048}
5049
// AArch64TTIImpl::getUnrollingPreferences — tune the generic unroller for
// AArch64: larger partial thresholds for inner loops, subtarget-specific
// tweaks (Apple, Falkor), multi-exit search loops, and runtime unrolling for
// in-order cores.
5052 OptimizationRemarkEmitter *ORE) const {
5053 // Enable partial unrolling and runtime unrolling.
5054 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5055
5056 UP.UpperBound = true;
5057
5058 // For inner loop, it is more likely to be a hot one, and the runtime check
5059 // can be promoted out from LICM pass, so the overhead is less, let's try
5060 // a larger threshold to unroll more loops.
5061 if (L->getLoopDepth() > 1)
5062 UP.PartialThreshold *= 2;
5063
5064 // Disable partial & runtime unrolling on -Os.
5066
5067 // Scan the loop: don't unroll loops with calls as this could prevent
5068 // inlining. Don't unroll auto-vectorized loops either, though do allow
5069 // unrolling of the scalar remainder.
5070 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5071 for (auto *BB : L->getBlocks()) {
5072 for (auto &I : *BB) {
5073 // Both auto-vectorized loops and the scalar remainder have the
5074 // isvectorized attribute, so differentiate between them by the presence
5075 // of vector instructions.
5076 if (IsVectorized && I.getType()->isVectorTy())
5077 return;
5078 if (isa<CallBase>(I)) {
5079 if (isa<CallInst>(I) || isa<InvokeInst>(I))
5080 if (const Function *F = cast<CallBase>(I).getCalledFunction())
5081 if (!isLoweredToCall(F))
5082 continue;
5083 return;
5084 }
5085 }
5086 }
5087
5088 // Apply subtarget-specific unrolling preferences.
5089 switch (ST->getProcFamily()) {
5090 case AArch64Subtarget::AppleA14:
5091 case AArch64Subtarget::AppleA15:
5092 case AArch64Subtarget::AppleA16:
5093 case AArch64Subtarget::AppleM4:
5094 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5095 break;
5096 case AArch64Subtarget::Falkor:
5099 break;
5100 default:
5101 break;
5102 }
5103
5104 // If this is a small, multi-exit loop similar to something like std::find,
5105 // then there is typically a performance improvement achieved by unrolling.
5106 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5107 UP.RuntimeUnrollMultiExit = true;
5108 UP.Runtime = true;
5109 // Limit unroll count.
5111 // Allow slightly more costly trip-count expansion to catch search loops
5112 // with pointer inductions.
5113 UP.SCEVExpansionBudget = 5;
5114 return;
5115 }
5116
5117 // Enable runtime unrolling for in-order models
5118 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
5119 // checking for that case, we can ensure that the default behaviour is
5120 // unchanged
5122 !ST->getSchedModel().isOutOfOrder()) {
5123 UP.Runtime = true;
5124 UP.Partial = true;
5125 UP.UnrollRemainder = true;
5127
5128 UP.UnrollAndJam = true;
5130 }
5131}
5132
// AArch64TTIImpl::getPeelingPreferences — signature head and body (inner
// lines 5133 and 5135, presumably a delegation to
// BaseT::getPeelingPreferences) were dropped by the extraction; only the
// parameter tail and closing brace remain. TODO confirm against the original.
5134 TTI::PeelingPreferences &PP) const {
5136}
5137
// AArch64TTIImpl::getOrCreateResultFromMemIntrinsic (signature's first line
// is missing from this extract): for NEON structured load/store intrinsics,
// produce a value of \p ExpectedType from the intrinsic so redundant memory
// ops can be eliminated.
// - For st2/st3/st4: if allowed to create IR (\p CanCreate) and the stored
//   operands match the fields of the expected struct type, rebuild the stored
//   aggregate with insertvalue and return it.
// - For ld2/ld3/ld4: return the intrinsic itself when its result type already
//   matches; no new IR is needed.
// Returns nullptr when no compatible result can be produced.
5139 Type *ExpectedType,
5140 bool CanCreate) const {
5141 switch (Inst->getIntrinsicID()) {
5142 default:
5143 return nullptr;
5144 case Intrinsic::aarch64_neon_st2:
5145 case Intrinsic::aarch64_neon_st3:
5146 case Intrinsic::aarch64_neon_st4: {
5147 // Create a struct type
5148 StructType *ST = dyn_cast<StructType>(ExpectedType);
5149 if (!CanCreate || !ST)
5150 return nullptr;
// The last argument of stN is the pointer; the preceding ones are the
// stored vectors, which must match the struct fields one-to-one.
5151 unsigned NumElts = Inst->arg_size() - 1;
5152 if (ST->getNumElements() != NumElts)
5153 return nullptr;
5154 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5155 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5156 return nullptr;
5157 }
5158 Value *Res = PoisonValue::get(ExpectedType);
5159 IRBuilder<> Builder(Inst);
5160 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5161 Value *L = Inst->getArgOperand(i);
5162 Res = Builder.CreateInsertValue(Res, L, i);
5163 }
5164 return Res;
5165 }
5166 case Intrinsic::aarch64_neon_ld2:
5167 case Intrinsic::aarch64_neon_ld3:
5168 case Intrinsic::aarch64_neon_ld4:
5169 if (Inst->getType() == ExpectedType)
5170 return Inst;
5171 return nullptr;
5172 }
5173}
5174
// AArch64TTIImpl::getTgtMemIntrinsic (signature's first line is missing from
// this extract): describe NEON structured load/store intrinsics to generic
// memory analyses. Fills \p Info with read/write direction, the pointer
// operand (first arg for ldN, last arg for stN), and a MatchingId pairing
// ldN with stN of the same arity. Returns true only for ld2/ld3/ld4 and
// st2/st3/st4; false for everything else.
5176 MemIntrinsicInfo &Info) const {
5177 switch (Inst->getIntrinsicID()) {
5178 default:
5179 break;
5180 case Intrinsic::aarch64_neon_ld2:
5181 case Intrinsic::aarch64_neon_ld3:
5182 case Intrinsic::aarch64_neon_ld4:
5183 Info.ReadMem = true;
5184 Info.WriteMem = false;
5185 Info.PtrVal = Inst->getArgOperand(0);
5186 break;
5187 case Intrinsic::aarch64_neon_st2:
5188 case Intrinsic::aarch64_neon_st3:
5189 case Intrinsic::aarch64_neon_st4:
5190 Info.ReadMem = false;
5191 Info.WriteMem = true;
5192 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5193 break;
5194 }
5195
// Second switch assigns a MatchingId so ld2/st2 (etc.) are recognized as a
// matching pair by passes that use this info.
5196 switch (Inst->getIntrinsicID()) {
5197 default:
5198 return false;
5199 case Intrinsic::aarch64_neon_ld2:
5200 case Intrinsic::aarch64_neon_st2:
5201 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5202 break;
5203 case Intrinsic::aarch64_neon_ld3:
5204 case Intrinsic::aarch64_neon_st3:
5205 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5206 break;
5207 case Intrinsic::aarch64_neon_ld4:
5208 case Intrinsic::aarch64_neon_st4:
5209 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5210 break;
5211 }
5212 return true;
5213}
5214
5215/// See if \p I should be considered for address type promotion. We check if \p
5216/// I is a sext with right type and used in memory accesses. If it used in a
5217/// "complex" getelementptr, we allow it to be promoted without finding other
5218/// sext instructions that sign extended the same initial value. A getelementptr
5219/// is considered as "complex" if it has more than 2 operands.
// NOTE(review): the signature's first line (inner line 5220,
// AArch64TTIImpl::shouldConsiderAddressTypePromotion) was dropped by the
// extraction; only the parameter tail below remains.
5221 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5222 bool Considerable = false;
5223 AllowPromotionWithoutCommonHeader = false;
// Only 64-bit sign-extensions qualify: address computation on AArch64 is
// done in 64 bits.
5224 if (!isa<SExtInst>(&I))
5225 return false;
5226 Type *ConsideredSExtType =
5227 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5228 if (I.getType() != ConsideredSExtType)
5229 return false;
5230 // See if the sext is the one with the right type and used in at least one
5231 // GetElementPtrInst.
5232 for (const User *U : I.users()) {
5233 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5234 Considerable = true;
5235 // A getelementptr is considered as "complex" if it has more than 2
5236 // operands. We will promote a SExt used in such complex GEP as we
5237 // expect some computation to be merged if they are done on 64 bits.
5238 if (GEPInst->getNumOperands() > 2) {
5239 AllowPromotionWithoutCommonHeader = true;
5240 break;
5241 }
5242 }
5243 }
5244 return Considerable;
5245}
5246
// AArch64TTIImpl::isLegalToVectorizeReduction (signature's first line is
// missing from this extract): scalable-VF reductions are only legal for the
// recurrence kinds whitelisted below; fixed-width VFs are always accepted.
// NOTE(review): inner lines 5253 (a check on the recurrence type, guarding
// the `return false`) and 5258 (the statement under `case RecurKind::Sub`)
// were dropped by the extraction — recover before editing logic.
5248 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5249 if (!VF.isScalable())
5250 return true;
5251
5252 Type *Ty = RdxDesc.getRecurrenceType();
5254 return false;
5255
5256 switch (RdxDesc.getRecurrenceKind()) {
5257 case RecurKind::Sub:
5259 case RecurKind::Add:
5260 case RecurKind::FAdd:
5261 case RecurKind::And:
5262 case RecurKind::Or:
5263 case RecurKind::Xor:
5264 case RecurKind::SMin:
5265 case RecurKind::SMax:
5266 case RecurKind::UMin:
5267 case RecurKind::UMax:
5268 case RecurKind::FMin:
5269 case RecurKind::FMax:
5270 case RecurKind::FMulAdd:
5271 case RecurKind::AnyOf:
5272 return true;
5273 default:
5274 return false;
5275 }
5276}
5277
// AArch64TTIImpl::getMinMaxReductionCost (signature's first lines are
// missing from this extract): cost a min/max reduction as any legalization
// splits (LT.first - 1 intrinsic applications on the legal type) plus a flat
// cost of 2 for the final horizontal reduction. Falls back to the base-class
// cost for f16 without full FP16 support.
// NOTE(review): inner line 5288 (the invalid-cost return under the
// <vscale x 1 x ty> guard) was dropped by the extraction.
5280 FastMathFlags FMF,
5282 // The code-generator is currently not able to handle scalable vectors
5283 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5284 // it. This change will be removed when code-generation for these types is
5285 // sufficiently reliable.
5286 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5287 if (VTy->getElementCount() == ElementCount::getScalable(1))
5289
5290 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5292 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5293 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5295 InstructionCost LegalizationCost = 0;
5296 if (LT.first > 1) {
5297 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5298 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5299 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5300 }
5301
5302 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5303}
5304
// AArch64TTIImpl::getArithmeticReductionCostSVE (signature's first line is
// missing from this extract): cost an SVE arithmetic reduction as the split
// cost (LT.first - 1 arithmetic ops on the legal type) plus 2 for the final
// horizontal reduction, for the opcodes that map to a single SVE reduction
// (add/and/or/xor/fadd).
// NOTE(review): inner line 5326 (the default-case return, presumably the
// base-class fallback or an invalid cost) was dropped by the extraction.
5306 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5307 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5308 InstructionCost LegalizationCost = 0;
5309 if (LT.first > 1) {
5310 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5311 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5312 LegalizationCost *= LT.first - 1;
5313 }
5314
5315 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5316 assert(ISD && "Invalid opcode");
5317 // Add the final reduction cost for the legal horizontal reduction
5318 switch (ISD) {
5319 case ISD::ADD:
5320 case ISD::AND:
5321 case ISD::OR:
5322 case ISD::XOR:
5323 case ISD::FADD:
5324 return LegalizationCost + 2;
5325 default:
5327 }
5328}
5329
// AArch64TTIImpl::getArithmeticReductionCost (signature's first lines are
// missing from this extract): cost arithmetic vector reductions. Handles,
// in order: <vscale x 1 x ty> (invalid), a fixed-vector path that adds a
// per-element penalty on top of the base cost, a scalable FAdd path costed
// as scalar-op * max element count, the generic SVE path, and a NEON table
// covering addv/or/xor/and plus an faddp-chain model for FADD.
// NOTE(review): dropped inner lines in this extract include 5340 (invalid
// return under the vscale-1 guard), 5342 (the condition guarding the
// fixed-vector block, likely an ordered-FP/streaming check), 5352 (return
// after the FAdd opcode check), and 5355 (the InstructionCost Cost
// declaration). Recover them before editing logic.
5332 std::optional<FastMathFlags> FMF,
5334 // The code-generator is currently not able to handle scalable vectors
5335 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5336 // it. This change will be removed when code-generation for these types is
5337 // sufficiently reliable.
5338 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5339 if (VTy->getElementCount() == ElementCount::getScalable(1))
5341
5343 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5344 InstructionCost BaseCost =
5345 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5346 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5347 // end up vectorizing for more computationally intensive loops.
5348 return BaseCost + FixedVTy->getNumElements();
5349 }
5350
5351 if (Opcode != Instruction::FAdd)
5353
5354 auto *VTy = cast<ScalableVectorType>(ValTy);
5356 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5357 Cost *= getMaxNumElements(VTy->getElementCount());
5358 return Cost;
5359 }
5360
5361 if (isa<ScalableVectorType>(ValTy))
5362 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5363
5364 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5365 MVT MTy = LT.second;
5366 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5367 assert(ISD && "Invalid opcode");
5368
5369 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5370 // instructions as twice a normal vector add, plus 1 for each legalization
5371 // step (LT.first). This is the only arithmetic vector reduction operation for
5372 // which we have an instruction.
5373 // OR, XOR and AND costs should match the codegen from:
5374 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5375 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5376 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5377 static const CostTblEntry CostTblNoPairwise[]{
5378 {ISD::ADD, MVT::v8i8, 2},
5379 {ISD::ADD, MVT::v16i8, 2},
5380 {ISD::ADD, MVT::v4i16, 2},
5381 {ISD::ADD, MVT::v8i16, 2},
5382 {ISD::ADD, MVT::v2i32, 2},
5383 {ISD::ADD, MVT::v4i32, 2},
5384 {ISD::ADD, MVT::v2i64, 2},
5385 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5386 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5387 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5388 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5389 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5390 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5391 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5392 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5393 {ISD::XOR, MVT::v16i8, 7},
5394 {ISD::XOR, MVT::v4i16, 4},
5395 {ISD::XOR, MVT::v8i16, 6},
5396 {ISD::XOR, MVT::v2i32, 3},
5397 {ISD::XOR, MVT::v4i32, 5},
5398 {ISD::XOR, MVT::v2i64, 3},
5399 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5400 {ISD::AND, MVT::v16i8, 7},
5401 {ISD::AND, MVT::v4i16, 4},
5402 {ISD::AND, MVT::v8i16, 6},
5403 {ISD::AND, MVT::v2i32, 3},
5404 {ISD::AND, MVT::v4i32, 5},
5405 {ISD::AND, MVT::v2i64, 3},
5406 };
5407 switch (ISD) {
5408 default:
5409 break;
5410 case ISD::FADD:
5411 if (Type *EltTy = ValTy->getScalarType();
5412 // FIXME: For half types without fullfp16 support, this could extend and
5413 // use a fp32 faddp reduction but current codegen unrolls.
5414 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5415 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5416 const unsigned NElts = MTy.getVectorNumElements();
5417 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5418 isPowerOf2_32(NElts))
5419 // Reduction corresponding to series of fadd instructions is lowered to
5420 // series of faddp instructions. faddp has latency/throughput that
5421 // matches fadd instruction and hence, every faddp instruction can be
5422 // considered to have a relative cost = 1 with
5423 // CostKind = TCK_RecipThroughput.
5424 // An faddp will pairwise add vector elements, so the size of input
5425 // vector reduces by half every time, requiring
5426 // #(faddp instructions) = log2_32(NElts).
5427 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5428 }
5429 break;
5430 case ISD::ADD:
5431 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5432 return (LT.first - 1) + Entry->Cost;
5433 break;
5434 case ISD::XOR:
5435 case ISD::AND:
5436 case ISD::OR:
5437 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5438 if (!Entry)
5439 break;
5440 auto *ValVTy = cast<FixedVectorType>(ValTy);
5441 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5442 isPowerOf2_32(ValVTy->getNumElements())) {
5443 InstructionCost ExtraCost = 0;
5444 if (LT.first != 1) {
5445 // Type needs to be split, so there is an extra cost of LT.first - 1
5446 // arithmetic ops.
5447 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5448 MTy.getVectorNumElements());
5449 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5450 ExtraCost *= LT.first - 1;
5451 }
5452 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5453 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5454 return Cost + ExtraCost;
5455 }
5456 break;
5457 }
5458 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5459}
5460
// AArch64TTIImpl::getExtendedReductionCost (signature's first line is
// missing from this extract): cost an extend-then-reduce (e.g. zext + add
// reduction). For add reductions whose legalized input maps to the UADDLV
// (8/16/32-bit elements -> <=32-bit result) or UADDLP (32 -> <=64) patterns,
// charge 2 per legalization split plus 2 for the final op; otherwise defer
// to the base-class model.
5462 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5463 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5464 EVT VecVT = TLI->getValueType(DL, VecTy);
5465 EVT ResVT = TLI->getValueType(DL, ResTy);
5466
5467 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5468 VecVT.getSizeInBits() >= 64) {
5469 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5470
5471 // The legal cases are:
5472 // UADDLV 8/16/32->32
5473 // UADDLP 32->64
5474 unsigned RevVTSize = ResVT.getSizeInBits();
5475 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5476 RevVTSize <= 32) ||
5477 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5478 RevVTSize <= 32) ||
5479 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5480 RevVTSize <= 64))
5481 return (LT.first - 1) * 2 + 2;
5482 }
5483
5484 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5485 CostKind);
5486}
5487
// AArch64TTIImpl::getMulAccReductionCost (signature's first lines are
// missing from this extract): cost a multiply-accumulate reduction. With the
// dot-product feature, an i8 input reducing to i32 maps to UDOT/SDOT plus a
// final uaddv, modeled as LT.first + 2; everything else uses the base model.
5490 VectorType *VecTy,
5492 EVT VecVT = TLI->getValueType(DL, VecTy);
5493 EVT ResVT = TLI->getValueType(DL, ResTy);
5494
5495 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) {
5496 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5497
5498 // The legal cases with dotprod are
5499 // UDOT 8->32
5500 // Which requires an additional uaddv to sum the i32 values.
5501 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5502 ResVT == MVT::i32)
5503 return LT.first + 2;
5504 }
5505
5506 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, VecTy, CostKind);
5507}
5508
// AArch64TTIImpl::getSpliceCost (signature lines are missing from this
// extract): cost a vector.splice on scalable types. i1 predicate vectors are
// promoted first (see AArch64ISelLowering.cpp), adding compare/select and
// zext/trunc legalization costs; the splice itself comes from the table
// below (1 for every supported nxv type).
// NOTE(review): dropped inner lines include 5532-5533 (the vscale-1
// invalid-cost guard whose comment survives), 5545/5547 (cost-kind and
// predicate arguments of the getCmpSelInstrCost calls), and 5555/5557
// (arguments of the getCastInstrCost calls). Recover before editing logic.
5512 static const CostTblEntry ShuffleTbl[] = {
5513 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5514 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5515 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5516 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5517 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5518 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5519 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5520 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5521 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5522 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5523 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5524 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5525 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5526 };
5527
5528 // The code-generator is currently not able to handle scalable vectors
5529 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5530 // it. This change will be removed when code-generation for these types is
5531 // sufficiently reliable.
5534
5535 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5536 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5537 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5538 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5539 : LT.second;
5540 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5541 InstructionCost LegalizationCost = 0;
5542 if (Index < 0) {
5543 LegalizationCost =
5544 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5546 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5548 }
5549
5550 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
5551 // Cost performed on a promoted type.
5552 if (LT.second.getScalarType() == MVT::i1) {
5553 LegalizationCost +=
5554 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5556 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5558 }
5559 const auto *Entry =
5560 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5561 assert(Entry && "Illegal Type for Splice");
5562 LegalizationCost += Entry->Cost;
5563 return LegalizationCost * LT.first;
5564}
5565
// AArch64TTIImpl::getPartialReductionCost (signature's first and last lines
// are missing from this extract): decide whether a partial reduction
// (dot-product style add/sub with extended operands) is profitable, and if
// so return its cost. Returns Invalid for unsupported opcode/extension/type
// combinations; otherwise scales a base Cost by the i8/i16 accumulator-width
// rules at the bottom.
// NOTE(review): dropped inner lines include 5570-5572 (rest of the
// signature and the Invalid/Cost declarations), 5574 (the first guard whose
// `return Invalid` survives), 5590 (the sve/streaming half of the usdot
// feature check), and 5601 (the SVE-availability guard inside the scalable
// branch). Recover before editing logic.
5567 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5569 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5573
5575 return Invalid;
5576
5577 // Sub opcodes currently only occur in chained cases.
5578 // Independent partial reduction subtractions are still costed as an add
5579 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5580 OpAExtend == TTI::PR_None)
5581 return Invalid;
5582
5583 // We only support multiply binary operations for now, and for muls we
5584 // require the types being extended to be the same.
5585 // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but
5586 // only if the i8mm or sve/streaming features are available.
5587 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB ||
5588 OpBExtend == TTI::PR_None ||
5589 (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
5591 return Invalid;
5592 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5593 "Unexpected values for OpBExtend or InputTypeB");
5594
5595 EVT InputEVT = EVT::getEVT(InputTypeA);
5596 EVT AccumEVT = EVT::getEVT(AccumType);
5597
5598 unsigned VFMinValue = VF.getKnownMinValue();
5599
5600 if (VF.isScalable()) {
5602 return Invalid;
5603
5604 // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
5605 // since we can't lower that type.
5606 unsigned Scale =
5607 AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
5608 if (VFMinValue == Scale)
5609 return Invalid;
5610 }
5611 if (VF.isFixed() &&
5612 (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64))
5613 return Invalid;
5614
// Width rules: i8 inputs accept i32/i64 accumulators at VF 8/16 (doubling
// cost for the mismatched-width pairings); i16 inputs only VF 8 into i64.
5615 if (InputEVT == MVT::i8) {
5616 switch (VFMinValue) {
5617 default:
5618 return Invalid;
5619 case 8:
5620 if (AccumEVT == MVT::i32)
5621 Cost *= 2;
5622 else if (AccumEVT != MVT::i64)
5623 return Invalid;
5624 break;
5625 case 16:
5626 if (AccumEVT == MVT::i64)
5627 Cost *= 2;
5628 else if (AccumEVT != MVT::i32)
5629 return Invalid;
5630 break;
5631 }
5632 } else if (InputEVT == MVT::i16) {
5633 // FIXME: Allow i32 accumulator but increase cost, as we would extend
5634 // it to i64.
5635 if (VFMinValue != 8 || AccumEVT != MVT::i64)
5636 return Invalid;
5637 } else
5638 return Invalid;
5639
5640 return Cost;
5641}
5642
// AArch64TTIImpl::getShuffleCost (signature's first line is missing from
// this extract): the main shuffle-cost model. In order it handles: splitting
// an over-wide fixed-vector mask into legal-width sub-shuffles (with LD3/LD4
// and ST3/ST4 special cases), improving the shuffle kind from the mask,
// cheap/free subvector extracts, SVE segmented shuffles (dupq / mov zd.q),
// free broadcast-loads for code-size, the perfect-shuffle table for 4-element
// vectors, free identity masks, native ZIP/UZP/REV/splat patterns, a large
// per-MVT cost table, scalable splices, aligned subvector inserts, and
// finally the base-class fallback.
// NOTE(review): this extract has dropped inner lines throughout (e.g.
// 5643-5644 in the signature, 5667-5668/5676-5678 in the LD/ST
// interleave checks, 5685/5687 around the split-cost setup, 5698/5700 in the
// mask-splitting loop, 5741 in the sub-shuffle recursion, 5765 restoring the
// extract-subvector kind, 5777-5782 in the segmented-shuffle guards,
// 5787 and 5807 feature/legality conditions, and 5995 before the final
// return). Do not edit logic from this copy.
5645 VectorType *SrcTy, ArrayRef<int> Mask,
5646 TTI::TargetCostKind CostKind, int Index,
5648 const Instruction *CxtI) const {
5649 assert((Mask.empty() || DstTy->isScalableTy() ||
5650 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5651 "Expected the Mask to match the return size if given");
5652 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5653 "Expected the same scalar types");
5654 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5655
5656 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5657 // into smaller vectors and sum the cost of each shuffle.
5658 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5659 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5660 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5661 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5662 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5663 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5664 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5665 // cost than just the load.
5666 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5669 return std::max<InstructionCost>(1, LT.first / 4);
5670
5671 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5672 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5673 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5674 // cost than just the store.
5675 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5677 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
5679 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
5680 return LT.first;
5681
5682 unsigned TpNumElts = Mask.size();
5683 unsigned LTNumElts = LT.second.getVectorNumElements();
5684 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5686 LT.second.getVectorElementCount());
5688 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5689 PreviousCosts;
5690 for (unsigned N = 0; N < NumVecs; N++) {
5691 SmallVector<int> NMask;
5692 // Split the existing mask into chunks of size LTNumElts. Track the source
5693 // sub-vectors to ensure the result has at most 2 inputs.
5694 unsigned Source1 = -1U, Source2 = -1U;
5695 unsigned NumSources = 0;
5696 for (unsigned E = 0; E < LTNumElts; E++) {
5697 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5699 if (MaskElt < 0) {
5701 continue;
5702 }
5703
5704 // Calculate which source from the input this comes from and whether it
5705 // is new to us.
5706 unsigned Source = MaskElt / LTNumElts;
5707 if (NumSources == 0) {
5708 Source1 = Source;
5709 NumSources = 1;
5710 } else if (NumSources == 1 && Source != Source1) {
5711 Source2 = Source;
5712 NumSources = 2;
5713 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5714 NumSources++;
5715 }
5716
5717 // Add to the new mask. For the NumSources>2 case these are not correct,
5718 // but are only used for the modular lane number.
5719 if (Source == Source1)
5720 NMask.push_back(MaskElt % LTNumElts);
5721 else if (Source == Source2)
5722 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
5723 else
5724 NMask.push_back(MaskElt % LTNumElts);
5725 }
5726 // Check if we have already generated this sub-shuffle, which means we
5727 // will have already generated the output. For example a <16 x i32> splat
5728 // will be the same sub-splat 4 times, which only needs to be generated
5729 // once and reused.
5730 auto Result =
5731 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
5732 // Check if it was already in the map (already costed).
5733 if (!Result.second)
5734 continue;
5735 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
5736 // getShuffleCost. If not then cost it using the worst case as the number
5737 // of element moves into a new vector.
5738 InstructionCost NCost =
5739 NumSources <= 2
5740 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
5742 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
5743 CxtI)
5744 : LTNumElts;
5745 Result.first->second = NCost;
5746 Cost += NCost;
5747 }
5748 return Cost;
5749 }
5750
5751 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
5752 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
5753 // A subvector extract can be implemented with an ext (or trivial extract, if
5754 // from lane 0). This currently only handles low or high extracts to prevent
5755 // SLP vectorizer regressions.
5756 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
5757 if (LT.second.is128BitVector() &&
5758 cast<FixedVectorType>(SubTp)->getNumElements() ==
5759 LT.second.getVectorNumElements() / 2) {
5760 if (Index == 0)
5761 return 0;
5762 if (Index == (int)LT.second.getVectorNumElements() / 2)
5763 return 1;
5764 }
5766 }
5767 // FIXME: This was added to keep the costs equal when adding DstTys. Update
5768 // the code to handle length-changing shuffles.
5769 if (Kind == TTI::SK_InsertSubvector) {
5770 LT = getTypeLegalizationCost(DstTy);
5771 SrcTy = DstTy;
5772 }
5773
5774 // Segmented shuffle matching.
5775 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
5776 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
5779
5780 FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
5781 unsigned Segments =
5783 unsigned SegmentElts = VTy->getNumElements() / Segments;
5784
5785 // dupq zd.t, zn.t[idx]
5786 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
5788 isDUPQMask(Mask, Segments, SegmentElts))
5789 return LT.first;
5790
5791 // mov zd.q, vn
5792 if (ST->isSVEorStreamingSVEAvailable() &&
5793 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
5794 return LT.first;
5795 }
5796
5797 // Check for broadcast loads, which are supported by the LD1R instruction.
5798 // In terms of code-size, the shuffle vector is free when a load + dup get
5799 // folded into a LD1R. That's what we check and return here. For performance
5800 // and reciprocal throughput, a LD1R is not completely free. In this case, we
5801 // return the cost for the broadcast below (i.e. 1 for most/all types), so
5802 // that we model the load + dup sequence slightly higher because LD1R is a
5803 // high latency instruction.
5804 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
5805 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
5806 if (IsLoad && LT.second.isVector() &&
5808 LT.second.getVectorElementCount()))
5809 return 0;
5810 }
5811
5812 // If we have 4 elements for the shuffle and a Mask, get the cost straight
5813 // from the perfect shuffle tables.
5814 if (Mask.size() == 4 &&
5815 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
5816 (SrcTy->getScalarSizeInBits() == 16 ||
5817 SrcTy->getScalarSizeInBits() == 32) &&
5818 all_of(Mask, [](int E) { return E < 8; }))
5819 return getPerfectShuffleCost(Mask);
5820
5821 // Check for identity masks, which we can treat as free.
5822 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
5823 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5824 all_of(enumerate(Mask), [](const auto &M) {
5825 return M.value() < 0 || M.value() == (int)M.index();
5826 }))
5827 return 0;
5828
5829 // Check for other shuffles that are not SK_ kinds but we have native
5830 // instructions for, for example ZIP and UZP.
5831 unsigned Unused;
5832 if (LT.second.isFixedLengthVector() &&
5833 LT.second.getVectorNumElements() == Mask.size() &&
5834 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5835 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
5836 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
5837 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5838 LT.second.getVectorNumElements(), 16) ||
5839 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5840 LT.second.getVectorNumElements(), 32) ||
5841 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5842 LT.second.getVectorNumElements(), 64) ||
5843 // Check for non-zero lane splats
5844 all_of(drop_begin(Mask),
5845 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
5846 return 1;
5847
5848 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
5849 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
5850 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
5851 static const CostTblEntry ShuffleTbl[] = {
5852 // Broadcast shuffle kinds can be performed with 'dup'.
5853 {TTI::SK_Broadcast, MVT::v8i8, 1},
5854 {TTI::SK_Broadcast, MVT::v16i8, 1},
5855 {TTI::SK_Broadcast, MVT::v4i16, 1},
5856 {TTI::SK_Broadcast, MVT::v8i16, 1},
5857 {TTI::SK_Broadcast, MVT::v2i32, 1},
5858 {TTI::SK_Broadcast, MVT::v4i32, 1},
5859 {TTI::SK_Broadcast, MVT::v2i64, 1},
5860 {TTI::SK_Broadcast, MVT::v4f16, 1},
5861 {TTI::SK_Broadcast, MVT::v8f16, 1},
5862 {TTI::SK_Broadcast, MVT::v4bf16, 1},
5863 {TTI::SK_Broadcast, MVT::v8bf16, 1},
5864 {TTI::SK_Broadcast, MVT::v2f32, 1},
5865 {TTI::SK_Broadcast, MVT::v4f32, 1},
5866 {TTI::SK_Broadcast, MVT::v2f64, 1},
5867 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
5868 // 'zip1/zip2' instructions.
5869 {TTI::SK_Transpose, MVT::v8i8, 1},
5870 {TTI::SK_Transpose, MVT::v16i8, 1},
5871 {TTI::SK_Transpose, MVT::v4i16, 1},
5872 {TTI::SK_Transpose, MVT::v8i16, 1},
5873 {TTI::SK_Transpose, MVT::v2i32, 1},
5874 {TTI::SK_Transpose, MVT::v4i32, 1},
5875 {TTI::SK_Transpose, MVT::v2i64, 1},
5876 {TTI::SK_Transpose, MVT::v4f16, 1},
5877 {TTI::SK_Transpose, MVT::v8f16, 1},
5878 {TTI::SK_Transpose, MVT::v4bf16, 1},
5879 {TTI::SK_Transpose, MVT::v8bf16, 1},
5880 {TTI::SK_Transpose, MVT::v2f32, 1},
5881 {TTI::SK_Transpose, MVT::v4f32, 1},
5882 {TTI::SK_Transpose, MVT::v2f64, 1},
5883 // Select shuffle kinds.
5884 // TODO: handle vXi8/vXi16.
5885 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
5886 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
5887 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
5888 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
5889 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
5890 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
5891 // PermuteSingleSrc shuffle kinds.
5892 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
5893 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
5894 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
5895 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
5896 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
5897 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
5898 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
5899 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
5900 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
5901 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
5902 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
5903 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
5904 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
5905 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
5906 // Reverse can be lowered with `rev`.
5907 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
5908 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
5909 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
5910 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
5911 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
5912 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
5913 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
5914 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
5915 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
5916 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
5917 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
5918 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
5919 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
5920 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
5921 // Splice can all be lowered as `ext`.
5922 {TTI::SK_Splice, MVT::v2i32, 1},
5923 {TTI::SK_Splice, MVT::v4i32, 1},
5924 {TTI::SK_Splice, MVT::v2i64, 1},
5925 {TTI::SK_Splice, MVT::v2f32, 1},
5926 {TTI::SK_Splice, MVT::v4f32, 1},
5927 {TTI::SK_Splice, MVT::v2f64, 1},
5928 {TTI::SK_Splice, MVT::v8f16, 1},
5929 {TTI::SK_Splice, MVT::v8bf16, 1},
5930 {TTI::SK_Splice, MVT::v8i16, 1},
5931 {TTI::SK_Splice, MVT::v16i8, 1},
5932 {TTI::SK_Splice, MVT::v4f16, 1},
5933 {TTI::SK_Splice, MVT::v4bf16, 1},
5934 {TTI::SK_Splice, MVT::v4i16, 1},
5935 {TTI::SK_Splice, MVT::v8i8, 1},
5936 // Broadcast shuffle kinds for scalable vectors
5937 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
5938 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
5939 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
5940 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
5941 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
5942 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
5943 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
5944 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
5945 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
5946 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
5947 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
5948 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
5949 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
5950 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
5951 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
5952 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
5953 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
5954 // Handle the cases for vector.reverse with scalable vectors
5955 {TTI::SK_Reverse, MVT::nxv16i8, 1},
5956 {TTI::SK_Reverse, MVT::nxv8i16, 1},
5957 {TTI::SK_Reverse, MVT::nxv4i32, 1},
5958 {TTI::SK_Reverse, MVT::nxv2i64, 1},
5959 {TTI::SK_Reverse, MVT::nxv2f16, 1},
5960 {TTI::SK_Reverse, MVT::nxv4f16, 1},
5961 {TTI::SK_Reverse, MVT::nxv8f16, 1},
5962 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
5963 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
5964 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
5965 {TTI::SK_Reverse, MVT::nxv2f32, 1},
5966 {TTI::SK_Reverse, MVT::nxv4f32, 1},
5967 {TTI::SK_Reverse, MVT::nxv2f64, 1},
5968 {TTI::SK_Reverse, MVT::nxv16i1, 1},
5969 {TTI::SK_Reverse, MVT::nxv8i1, 1},
5970 {TTI::SK_Reverse, MVT::nxv4i1, 1},
5971 {TTI::SK_Reverse, MVT::nxv2i1, 1},
5972 };
5973 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
5974 return LT.first * Entry->Cost;
5975 }
5976
5977 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
5978 return getSpliceCost(SrcTy, Index, CostKind);
5979
5980 // Inserting a subvector can often be done with either a D, S or H register
5981 // move, so long as the inserted vector is "aligned".
5982 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
5983 LT.second.getSizeInBits() <= 128 && SubTp) {
5984 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
5985 if (SubLT.second.isVector()) {
5986 int NumElts = LT.second.getVectorNumElements();
5987 int NumSubElts = SubLT.second.getVectorNumElements();
5988 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
5989 return SubLT.first;
5990 }
5991 }
5992
5993 // Restore optimal kind.
5994 if (IsExtractSubvector)
5996 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
5997 Args, CxtI);
5998}
5999
6002 const auto &Strides = DenseMap<Value *, const SCEV *>();
6003 for (BasicBlock *BB : TheLoop->blocks()) {
6004 // Scan the instructions in the block and look for addresses that are
6005 // consecutive and decreasing.
6006 for (Instruction &I : *BB) {
6007 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6009 Type *AccessTy = getLoadStoreType(&I);
6010 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
6011 /*ShouldCheckWrap=*/false)
6012 .value_or(0) < 0)
6013 return true;
6014 }
6015 }
6016 }
6017 return false;
6018}
6019
6023 return ST->useFixedOverScalableIfEqualCost();
6024}
6025
6027 return ST->getEpilogueVectorizationMinVF();
6028}
6029
6031 if (!ST->hasSVE())
6032 return false;
6033
6034 // We don't currently support vectorisation with interleaving for SVE - with
6035 // such loops we're better off not using tail-folding. This gives us a chance
6036 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6037 if (TFI->IAI->hasGroups())
6038 return false;
6039
6041 if (TFI->LVL->getReductionVars().size())
6043 if (TFI->LVL->getFixedOrderRecurrences().size())
6045
6046 // We call this to discover whether any load/store pointers in the loop have
6047 // negative strides. This will require extra work to reverse the loop
6048 // predicate, which may be expensive.
6054
6056 Required))
6057 return false;
6058
6059 // Don't tail-fold for tight loops where we would be better off interleaving
6060 // with an unpredicated loop.
6061 unsigned NumInsns = 0;
6062 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6063 NumInsns += BB->sizeWithoutDebug();
6064 }
6065
6066 // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
6067 return NumInsns >= SVETailFoldInsnThreshold;
6068}
6069
6072 StackOffset BaseOffset, bool HasBaseReg,
6073 int64_t Scale, unsigned AddrSpace) const {
6074 // Scaling factors are not free at all.
6075 // Operands | Rt Latency
6076 // -------------------------------------------
6077 // Rt, [Xn, Xm] | 4
6078 // -------------------------------------------
6079 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6080 // Rt, [Xn, Wm, <extend> #imm] |
6082 AM.BaseGV = BaseGV;
6083 AM.BaseOffs = BaseOffset.getFixed();
6084 AM.HasBaseReg = HasBaseReg;
6085 AM.Scale = Scale;
6086 AM.ScalableOffset = BaseOffset.getScalable();
6087 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6088 // Scale represents reg2 * scale, thus account for 1 if
6089 // it is not equal to 0 or 1.
6090 return AM.Scale != 0 && AM.Scale != 1;
6092}
6093
6095 const Instruction *I) const {
6097 // For the binary operators (e.g. or) we need to be more careful than
6098 // selects, here we only transform them if they are already at a natural
6099 // break point in the code - the end of a block with an unconditional
6100 // terminator.
6101 if (I->getOpcode() == Instruction::Or &&
6102 isa<BranchInst>(I->getNextNode()) &&
6103 cast<BranchInst>(I->getNextNode())->isUnconditional())
6104 return true;
6105
6106 if (I->getOpcode() == Instruction::Add ||
6107 I->getOpcode() == Instruction::Sub)
6108 return true;
6109 }
6111}
6112
6115 const TargetTransformInfo::LSRCost &C2) const {
6116 // AArch64 specific here is adding the number of instructions to the
6117 // comparison (though not as the first consideration, as some targets do)
6118 // along with changing the priority of the base additions.
6119 // TODO: Maybe a more nuanced tradeoff between instruction count
6120 // and number of registers? To be investigated at a later date.
6121 if (EnableLSRCostOpt)
6122 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6123 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6124 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6125 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6126
6128}
6129
6130static bool isSplatShuffle(Value *V) {
6131 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6132 return all_equal(Shuf->getShuffleMask());
6133 return false;
6134}
6135
6136/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6137/// or upper half of the vector elements.
6138static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6139 bool AllowSplat = false) {
6140 // Scalable types can't be extract shuffle vectors.
6141 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6142 return false;
6143
6144 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6145 auto *FullTy = FullV->getType();
6146 auto *HalfTy = HalfV->getType();
6147 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6148 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6149 };
6150
6151 auto extractHalf = [](Value *FullV, Value *HalfV) {
6152 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6153 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6154 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6155 };
6156
6157 ArrayRef<int> M1, M2;
6158 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6159 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6160 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6161 return false;
6162
6163 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6164 // it is not checked as an extract below.
6165 if (AllowSplat && isSplatShuffle(Op1))
6166 S1Op1 = nullptr;
6167 if (AllowSplat && isSplatShuffle(Op2))
6168 S2Op1 = nullptr;
6169
6170 // Check that the operands are half as wide as the result and we extract
6171 // half of the elements of the input vectors.
6172 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6173 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6174 return false;
6175
6176 // Check the mask extracts either the lower or upper half of vector
6177 // elements.
6178 int M1Start = 0;
6179 int M2Start = 0;
6180 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6181 if ((S1Op1 &&
6182 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6183 (S2Op1 &&
6184 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6185 return false;
6186
6187 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6188 (M2Start != 0 && M2Start != (NumElements / 2)))
6189 return false;
6190 if (S1Op1 && S2Op1 && M1Start != M2Start)
6191 return false;
6192
6193 return true;
6194}
6195
6196/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6197/// of the vector elements.
6198static bool areExtractExts(Value *Ext1, Value *Ext2) {
6199 auto areExtDoubled = [](Instruction *Ext) {
6200 return Ext->getType()->getScalarSizeInBits() ==
6201 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6202 };
6203
6204 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6205 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6206 !areExtDoubled(cast<Instruction>(Ext1)) ||
6207 !areExtDoubled(cast<Instruction>(Ext2)))
6208 return false;
6209
6210 return true;
6211}
6212
6213/// Check if Op could be used with vmull_high_p64 intrinsic.
6215 Value *VectorOperand = nullptr;
6216 ConstantInt *ElementIndex = nullptr;
6217 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6218 m_ConstantInt(ElementIndex))) &&
6219 ElementIndex->getValue() == 1 &&
6220 isa<FixedVectorType>(VectorOperand->getType()) &&
6221 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6222}
6223
6224/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6225static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6227}
6228
6230 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6231 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6232 if (!GEP || GEP->getNumOperands() != 2)
6233 return false;
6234
6235 Value *Base = GEP->getOperand(0);
6236 Value *Offsets = GEP->getOperand(1);
6237
6238 // We only care about scalar_base+vector_offsets.
6239 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6240 return false;
6241
6242 // Sink extends that would allow us to use 32-bit offset vectors.
6243 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6244 auto *OffsetsInst = cast<Instruction>(Offsets);
6245 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6246 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6247 Ops.push_back(&GEP->getOperandUse(1));
6248 }
6249
6250 // Sink the GEP.
6251 return true;
6252}
6253
6254/// We want to sink following cases:
6255/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6256/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6258 if (match(Op, m_VScale()))
6259 return true;
6260 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6262 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6263 return true;
6264 }
6265 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6267 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6268 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6269 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6270 return true;
6271 }
6272 return false;
6273}
6274
6275/// Check if sinking \p I's operands to I's basic block is profitable, because
6276/// the operands can be folded into a target instruction, e.g.
6277/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6279 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6280 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
6281 switch (II->getIntrinsicID()) {
6282 case Intrinsic::aarch64_neon_smull:
6283 case Intrinsic::aarch64_neon_umull:
6284 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6285 /*AllowSplat=*/true)) {
6286 Ops.push_back(&II->getOperandUse(0));
6287 Ops.push_back(&II->getOperandUse(1));
6288 return true;
6289 }
6290 [[fallthrough]];
6291
6292 case Intrinsic::fma:
6293 case Intrinsic::fmuladd:
6294 if (isa<VectorType>(I->getType()) &&
6295 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6296 !ST->hasFullFP16())
6297 return false;
6298 [[fallthrough]];
6299 case Intrinsic::aarch64_neon_sqdmull:
6300 case Intrinsic::aarch64_neon_sqdmulh:
6301 case Intrinsic::aarch64_neon_sqrdmulh:
6302 // Sink splats for index lane variants
6303 if (isSplatShuffle(II->getOperand(0)))
6304 Ops.push_back(&II->getOperandUse(0));
6305 if (isSplatShuffle(II->getOperand(1)))
6306 Ops.push_back(&II->getOperandUse(1));
6307 return !Ops.empty();
6308 case Intrinsic::aarch64_neon_fmlal:
6309 case Intrinsic::aarch64_neon_fmlal2:
6310 case Intrinsic::aarch64_neon_fmlsl:
6311 case Intrinsic::aarch64_neon_fmlsl2:
6312 // Sink splats for index lane variants
6313 if (isSplatShuffle(II->getOperand(1)))
6314 Ops.push_back(&II->getOperandUse(1));
6315 if (isSplatShuffle(II->getOperand(2)))
6316 Ops.push_back(&II->getOperandUse(2));
6317 return !Ops.empty();
6318 case Intrinsic::aarch64_sve_ptest_first:
6319 case Intrinsic::aarch64_sve_ptest_last:
6320 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6321 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6322 Ops.push_back(&II->getOperandUse(0));
6323 return !Ops.empty();
6324 case Intrinsic::aarch64_sme_write_horiz:
6325 case Intrinsic::aarch64_sme_write_vert:
6326 case Intrinsic::aarch64_sme_writeq_horiz:
6327 case Intrinsic::aarch64_sme_writeq_vert: {
6328 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6329 if (!Idx || Idx->getOpcode() != Instruction::Add)
6330 return false;
6331 Ops.push_back(&II->getOperandUse(1));
6332 return true;
6333 }
6334 case Intrinsic::aarch64_sme_read_horiz:
6335 case Intrinsic::aarch64_sme_read_vert:
6336 case Intrinsic::aarch64_sme_readq_horiz:
6337 case Intrinsic::aarch64_sme_readq_vert:
6338 case Intrinsic::aarch64_sme_ld1b_vert:
6339 case Intrinsic::aarch64_sme_ld1h_vert:
6340 case Intrinsic::aarch64_sme_ld1w_vert:
6341 case Intrinsic::aarch64_sme_ld1d_vert:
6342 case Intrinsic::aarch64_sme_ld1q_vert:
6343 case Intrinsic::aarch64_sme_st1b_vert:
6344 case Intrinsic::aarch64_sme_st1h_vert:
6345 case Intrinsic::aarch64_sme_st1w_vert:
6346 case Intrinsic::aarch64_sme_st1d_vert:
6347 case Intrinsic::aarch64_sme_st1q_vert:
6348 case Intrinsic::aarch64_sme_ld1b_horiz:
6349 case Intrinsic::aarch64_sme_ld1h_horiz:
6350 case Intrinsic::aarch64_sme_ld1w_horiz:
6351 case Intrinsic::aarch64_sme_ld1d_horiz:
6352 case Intrinsic::aarch64_sme_ld1q_horiz:
6353 case Intrinsic::aarch64_sme_st1b_horiz:
6354 case Intrinsic::aarch64_sme_st1h_horiz:
6355 case Intrinsic::aarch64_sme_st1w_horiz:
6356 case Intrinsic::aarch64_sme_st1d_horiz:
6357 case Intrinsic::aarch64_sme_st1q_horiz: {
6358 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6359 if (!Idx || Idx->getOpcode() != Instruction::Add)
6360 return false;
6361 Ops.push_back(&II->getOperandUse(3));
6362 return true;
6363 }
6364 case Intrinsic::aarch64_neon_pmull:
6365 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6366 return false;
6367 Ops.push_back(&II->getOperandUse(0));
6368 Ops.push_back(&II->getOperandUse(1));
6369 return true;
6370 case Intrinsic::aarch64_neon_pmull64:
6371 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6372 II->getArgOperand(1)))
6373 return false;
6374 Ops.push_back(&II->getArgOperandUse(0));
6375 Ops.push_back(&II->getArgOperandUse(1));
6376 return true;
6377 case Intrinsic::masked_gather:
6378 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6379 return false;
6380 Ops.push_back(&II->getArgOperandUse(0));
6381 return true;
6382 case Intrinsic::masked_scatter:
6383 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6384 return false;
6385 Ops.push_back(&II->getArgOperandUse(1));
6386 return true;
6387 default:
6388 return false;
6389 }
6390 }
6391
6392 auto ShouldSinkCondition = [](Value *Cond,
6393 SmallVectorImpl<Use *> &Ops) -> bool {
6394 if (!isa<IntrinsicInst>(Cond))
6395 return false;
6396 auto *II = dyn_cast<IntrinsicInst>(Cond);
6397 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6398 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6399 return false;
6400 if (isa<CmpInst>(II->getOperand(0)))
6401 Ops.push_back(&II->getOperandUse(0));
6402 return true;
6403 };
6404
6405 switch (I->getOpcode()) {
6406 case Instruction::GetElementPtr:
6407 case Instruction::Add:
6408 case Instruction::Sub:
6409 // Sink vscales closer to uses for better isel
6410 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6411 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6412 Ops.push_back(&I->getOperandUse(Op));
6413 return true;
6414 }
6415 }
6416 break;
6417 case Instruction::Select: {
6418 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6419 return false;
6420
6421 Ops.push_back(&I->getOperandUse(0));
6422 return true;
6423 }
6424 case Instruction::Br: {
6425 if (cast<BranchInst>(I)->isUnconditional())
6426 return false;
6427
6428 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6429 return false;
6430
6431 Ops.push_back(&I->getOperandUse(0));
6432 return true;
6433 }
6434 default:
6435 break;
6436 }
6437
6438 if (!I->getType()->isVectorTy())
6439 return false;
6440
6441 switch (I->getOpcode()) {
6442 case Instruction::Sub:
6443 case Instruction::Add: {
6444 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6445 return false;
6446
6447 // If the exts' operands extract either the lower or upper elements, we
6448 // can sink them too.
6449 auto Ext1 = cast<Instruction>(I->getOperand(0));
6450 auto Ext2 = cast<Instruction>(I->getOperand(1));
6451 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6452 Ops.push_back(&Ext1->getOperandUse(0));
6453 Ops.push_back(&Ext2->getOperandUse(0));
6454 }
6455
6456 Ops.push_back(&I->getOperandUse(0));
6457 Ops.push_back(&I->getOperandUse(1));
6458
6459 return true;
6460 }
6461 case Instruction::Or: {
6462 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6463 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6464 if (ST->hasNEON()) {
6465 Instruction *OtherAnd, *IA, *IB;
6466 Value *MaskValue;
6467 // MainAnd refers to And instruction that has 'Not' as one of its operands
6468 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6469 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6470 m_Instruction(IA)))))) {
6471 if (match(OtherAnd,
6472 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6473 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6474 ? cast<Instruction>(I->getOperand(1))
6475 : cast<Instruction>(I->getOperand(0));
6476
6477 // Both Ands should be in same basic block as Or
6478 if (I->getParent() != MainAnd->getParent() ||
6479 I->getParent() != OtherAnd->getParent())
6480 return false;
6481
6482 // Non-mask operands of both Ands should also be in same basic block
6483 if (I->getParent() != IA->getParent() ||
6484 I->getParent() != IB->getParent())
6485 return false;
6486
6487 Ops.push_back(
6488 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6489 Ops.push_back(&I->getOperandUse(0));
6490 Ops.push_back(&I->getOperandUse(1));
6491
6492 return true;
6493 }
6494 }
6495 }
6496
6497 return false;
6498 }
6499 case Instruction::Mul: {
6500 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6501 auto *Ty = cast<VectorType>(V->getType());
6502 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6503 if (Ty->isScalableTy())
6504 return false;
6505
6506 // Indexed variants of Mul exist for i16 and i32 element types only.
6507 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6508 };
6509
6510 int NumZExts = 0, NumSExts = 0;
6511 for (auto &Op : I->operands()) {
6512 // Make sure we are not already sinking this operand
6513 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6514 continue;
6515
6516 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6517 auto *Ext = cast<Instruction>(Op);
6518 auto *ExtOp = Ext->getOperand(0);
6519 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6520 Ops.push_back(&Ext->getOperandUse(0));
6521 Ops.push_back(&Op);
6522
6523 if (isa<SExtInst>(Ext))
6524 NumSExts++;
6525 else
6526 NumZExts++;
6527
6528 continue;
6529 }
6530
6531 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6532 if (!Shuffle)
6533 continue;
6534
6535 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6536 // operand and the s/zext can help create indexed s/umull. This is
6537 // especially useful to prevent i64 mul being scalarized.
6538 if (isSplatShuffle(Shuffle) &&
6539 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6540 Ops.push_back(&Shuffle->getOperandUse(0));
6541 Ops.push_back(&Op);
6542 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6543 NumSExts++;
6544 else
6545 NumZExts++;
6546 continue;
6547 }
6548
6549 Value *ShuffleOperand = Shuffle->getOperand(0);
6550 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6551 if (!Insert)
6552 continue;
6553
6554 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6555 if (!OperandInstr)
6556 continue;
6557
6558 ConstantInt *ElementConstant =
6559 dyn_cast<ConstantInt>(Insert->getOperand(2));
6560 // Check that the insertelement is inserting into element 0
6561 if (!ElementConstant || !ElementConstant->isZero())
6562 continue;
6563
6564 unsigned Opcode = OperandInstr->getOpcode();
6565 if (Opcode == Instruction::SExt)
6566 NumSExts++;
6567 else if (Opcode == Instruction::ZExt)
6568 NumZExts++;
6569 else {
6570 // If we find that the top bits are known 0, then we can sink and allow
6571 // the backend to generate a umull.
6572 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6573 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6574 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6575 continue;
6576 NumZExts++;
6577 }
6578
6579 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6580 // the And, just to hoist it again back to the load.
6581 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6582 Ops.push_back(&Insert->getOperandUse(1));
6583 Ops.push_back(&Shuffle->getOperandUse(0));
6584 Ops.push_back(&Op);
6585 }
6586
6587 // It is profitable to sink if we found two of the same type of extends.
6588 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6589 return true;
6590
6591 // Otherwise, see if we should sink splats for indexed variants.
6592 if (!ShouldSinkSplatForIndexedVariant(I))
6593 return false;
6594
6595 Ops.clear();
6596 if (isSplatShuffle(I->getOperand(0)))
6597 Ops.push_back(&I->getOperandUse(0));
6598 if (isSplatShuffle(I->getOperand(1)))
6599 Ops.push_back(&I->getOperandUse(1));
6600
6601 return !Ops.empty();
6602 }
6603 case Instruction::FMul: {
6604 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6605 if (I->getType()->isScalableTy())
6606 return false;
6607
6608 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6609 !ST->hasFullFP16())
6610 return false;
6611
6612 // Sink splats for index lane variants
6613 if (isSplatShuffle(I->getOperand(0)))
6614 Ops.push_back(&I->getOperandUse(0));
6615 if (isSplatShuffle(I->getOperand(1)))
6616 Ops.push_back(&I->getOperandUse(1));
6617 return !Ops.empty();
6618 }
6619 default:
6620 return false;
6621 }
6622 return false;
6623}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static std::optional< Instruction * > instCombineSMECntsElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts, const AArch64Subtarget *ST)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file contains a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
This file defines the DenseMap class.
uint32_t Index
uint64_t Size
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
Hexagon Common GEP
This file provides the interface for the instcombine pass implementation.
static LVOptions Options
Definition: LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
if(PassOpts->AAPipeline)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition: Debug.h:119
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
bool isStreamingSVEAvailable() const
Returns true if the target has access to the streaming-compatible subset of SVE instructions.
unsigned getMaxInterleaveFactor() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
unsigned getEpilogueVectorizationMinVF() const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
bool prefersVectorizedAddressing() const override
bool preferFixedOverScalableIfEqualCost() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const override
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool enableScalableVectorization() const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1670
unsigned countLeadingOnes() const
Definition: APInt.h:1624
void negate()
Negate this APInt in place.
Definition: APInt.h:1468
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1041
unsigned logBase2() const
Definition: APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:233
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:888
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
Definition: BasicTTIImpl.h:459
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
Compute a cost of the given call instruction.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Definition: BasicTTIImpl.h:702
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition: BasicTTIImpl.h:774
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:997
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
Definition: BasicTTIImpl.h:548
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition: InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1116
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
unsigned arg_size() const
Definition: InstrTypes.h:1290
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:681
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:708
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:684
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:682
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:683
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:686
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:689
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:685
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:687
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:706
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:694
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:688
bool isIntPredicate() const
Definition: InstrTypes.h:785
bool isUnsigned() const
Definition: InstrTypes.h:938
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:23
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1677
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:214
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:154
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:882
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1474
This is an important base class in LLVM.
Definition: Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:674
bool empty() const
Definition: DenseMap.h:119
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:168
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:315
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:312
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition: IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
bool allowContract() const
Definition: FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:949
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2571
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2625
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1107
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2559
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1115
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:488
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1005
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition: IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1923
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2286
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2494
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1714
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2204
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1847
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2593
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1860
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:508
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition: IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2277
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
Definition: IRBuilder.cpp:123
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:48
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:388
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:412
BuilderTy & Builder
Definition: InstCombiner.h:61
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
Definition: Instruction.h:317
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:312
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:733
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:56
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
An instruction for reading from memory.
Definition: Instructions.h:180
Value * getPointerOperand()
Definition: Instructions.h:259
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:56
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1885
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:90
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
bool isNewZA() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:825
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
Definition: DerivedTypes.h:681
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition: SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:476
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:806
void resize(size_type N)
Definition: SmallVector.h:639
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:34
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:44
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:43
An instruction for storing to memory.
Definition: Instructions.h:296
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:710
Class to represent struct types.
Definition: DerivedTypes.h:218
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
virtual const DataLayout & getDataLayout() const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:349
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:270
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1866
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
const Use & getOperandUse(unsigned i) const
Definition: User.h:245
Value * getOperand(unsigned i) const
Definition: User.h:232
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:953
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:396
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:695
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:481
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:463
int getNumOccurrences() const
Definition: CommandLine.h:400
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
Definition: TypeSize.h:184
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
constexpr bool isNonZero() const
Definition: TypeSize.h:159
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:172
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:175
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169
const ParentTy * getParent() const
Definition: ilist_node.h:34
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:751
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:862
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:560
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:592
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:464
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1121
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
@ Uninitialized
Definition: Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
Definition: LoopInfo.cpp:1089
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:293
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition: VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition: Error.cpp:167
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition: STLExtras.h:2127
InstructionCost Cost
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:378
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:299
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Type Conversion Cost Table.
Definition: CostTable.h:55