LLVM 21.0.0git
AArch64TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
29#include <algorithm>
30#include <optional>
31using namespace llvm;
32using namespace llvm::PatternMatch;
33
34#define DEBUG_TYPE "aarch64tti"
35
// Command-line knobs that tune the AArch64 cost model and related heuristics.
// NOTE(review): this excerpt comes from a rendered source listing; several of
// the multi-line cl::opt declarations below are missing their opening lines
// (e.g. the declaration headers for the SVE gather/scatter overheads and the
// PSTATE.SM penalty options) — consult the full source before editing.
36static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
37 cl::init(true), cl::Hidden);
38
40 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
41
42static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
44
45static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
46 cl::init(10), cl::Hidden);
47
48static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
49 cl::init(15), cl::Hidden);
50
52 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
54
56 "call-penalty-sm-change", cl::init(5), cl::Hidden,
58 "Penalty of calling a function that requires a change to PSTATE.SM"));
59
61 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
62 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
63
64static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
65 cl::init(true), cl::Hidden);
66
67static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
68 cl::init(true), cl::Hidden);
69
70// A complete guess as to a reasonable cost.
72 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
73 cl::desc("The cost of a histcnt instruction"));
74
76 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
77 cl::desc("The number of instructions to search for a redundant dmb"));
78
79namespace {
80class TailFoldingOption {
81 // These bitfields will only ever be set to something non-zero in operator=,
82 // when setting the -sve-tail-folding option. This option should always be of
83 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
84 // InitialBits is one of (disabled|all|simple). EnableBits represents
85 // additional flags we're enabling, and DisableBits for those flags we're
86 // disabling. The default flag is tracked in the variable NeedsDefault, since
87 // at the time of setting the option we may not know what the default value
88 // for the CPU is.
89 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
90 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
91 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
92
93 // This value needs to be initialised to true in case the user does not
94 // explicitly set the -sve-tail-folding option.
95 bool NeedsDefault = true;
96
97 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
98
99 void setNeedsDefault(bool V) { NeedsDefault = V; }
100
101 void setEnableBit(TailFoldingOpts Bit) {
102 EnableBits |= Bit;
103 DisableBits &= ~Bit;
104 }
105
106 void setDisableBit(TailFoldingOpts Bit) {
107 EnableBits &= ~Bit;
108 DisableBits |= Bit;
109 }
110
111 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
112 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
113
114 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
115 "Initial bits should only include one of "
116 "(disabled|all|simple|default)");
117 Bits = NeedsDefault ? DefaultBits : InitialBits;
118 Bits |= EnableBits;
119 Bits &= ~DisableBits;
120
121 return Bits;
122 }
123
124 void reportError(std::string Opt) {
125 errs() << "invalid argument '" << Opt
126 << "' to -sve-tail-folding=; the option should be of the form\n"
127 " (disabled|all|default|simple)[+(reductions|recurrences"
128 "|reverse|noreductions|norecurrences|noreverse)]\n";
129 report_fatal_error("Unrecognised tail-folding option");
130 }
131
132public:
133
134 void operator=(const std::string &Val) {
135 // If the user explicitly sets -sve-tail-folding= then treat as an error.
136 if (Val.empty()) {
137 reportError("");
138 return;
139 }
140
141 // Since the user is explicitly setting the option we don't automatically
142 // need the default unless they require it.
143 setNeedsDefault(false);
144
145 SmallVector<StringRef, 4> TailFoldTypes;
146 StringRef(Val).split(TailFoldTypes, '+', -1, false);
147
148 unsigned StartIdx = 1;
149 if (TailFoldTypes[0] == "disabled")
150 setInitialBits(TailFoldingOpts::Disabled);
151 else if (TailFoldTypes[0] == "all")
152 setInitialBits(TailFoldingOpts::All);
153 else if (TailFoldTypes[0] == "default")
154 setNeedsDefault(true);
155 else if (TailFoldTypes[0] == "simple")
156 setInitialBits(TailFoldingOpts::Simple);
157 else {
158 StartIdx = 0;
159 setInitialBits(TailFoldingOpts::Disabled);
160 }
161
162 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
163 if (TailFoldTypes[I] == "reductions")
164 setEnableBit(TailFoldingOpts::Reductions);
165 else if (TailFoldTypes[I] == "recurrences")
166 setEnableBit(TailFoldingOpts::Recurrences);
167 else if (TailFoldTypes[I] == "reverse")
168 setEnableBit(TailFoldingOpts::Reverse);
169 else if (TailFoldTypes[I] == "noreductions")
170 setDisableBit(TailFoldingOpts::Reductions);
171 else if (TailFoldTypes[I] == "norecurrences")
172 setDisableBit(TailFoldingOpts::Recurrences);
173 else if (TailFoldTypes[I] == "noreverse")
174 setDisableBit(TailFoldingOpts::Reverse);
175 else
176 reportError(Val);
177 }
178 }
179
180 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
181 return (getBits(DefaultBits) & Required) == Required;
182 }
183};
184} // namespace
185
// Global storage for the parsed -sve-tail-folding option; consulted when
// deciding whether a given loop kind may use tail-folding.
// NOTE(review): the cl::opt declaration that binds to this object (presumably
// via cl::location) has its opening and closing lines elided in this excerpt,
// as have the cl::opt headers of the two experimental streaming-mode options
// further below — confirm against the full source.
186TailFoldingOption TailFoldingOptionLoc;
187
189 "sve-tail-folding",
190 cl::desc(
191 "Control the use of vectorisation using tail-folding for SVE where the"
192 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
193 "\ndisabled (Initial) No loop types will vectorize using "
194 "tail-folding"
195 "\ndefault (Initial) Uses the default tail-folding settings for "
196 "the target CPU"
197 "\nall (Initial) All legal loop types will vectorize using "
198 "tail-folding"
199 "\nsimple (Initial) Use tail-folding for simple loops (not "
200 "reductions or recurrences)"
201 "\nreductions Use tail-folding for loops containing reductions"
202 "\nnoreductions Inverse of above"
203 "\nrecurrences Use tail-folding for loops containing fixed order "
204 "recurrences"
205 "\nnorecurrences Inverse of above"
206 "\nreverse Use tail-folding for loops requiring reversed "
207 "predicates"
208 "\nnoreverse Inverse of above"),
210
211// Experimental option that will only be fully functional when the
212// code-generator is changed to use SVE instead of NEON for all fixed-width
213// operations.
215 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
216
217// Experimental option that will only be fully functional when the cost-model
218// and code-generator have been changed to avoid using scalable vector
219// instructions that are not legal in streaming SVE mode.
221 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
222
223static bool isSMEABIRoutineCall(const CallInst &CI) {
224 const auto *F = CI.getCalledFunction();
225 return F && StringSwitch<bool>(F->getName())
226 .Case("__arm_sme_state", true)
227 .Case("__arm_tpidr2_save", true)
228 .Case("__arm_tpidr2_restore", true)
229 .Case("__arm_za_disable", true)
230 .Default(false);
231}
232
233/// Returns true if the function has explicit operations that can only be
234/// lowered using incompatible instructions for the selected mode. This also
235/// returns true if the function F may use or modify ZA state.
// NOTE(review): the function's signature line is elided in this excerpt.
237 for (const BasicBlock &BB : *F) {
238 for (const Instruction &I : BB) {
239 // Be conservative for now and assume that any call to inline asm or to
240 // intrinsics could result in non-streaming ops (e.g. calls to
241 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
242 // all native LLVM instructions can be lowered to compatible instructions.
243 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
244 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
245 isSMEABIRoutineCall(cast<CallInst>(I))))
246 return true;
247 }
248 }
249 return false;
250}
251
// Computes the function-multiversioning (FMV) priority for F: split the
// relevant feature-string attribute and hand the pieces to
// AArch64::getFMVPriority.
// NOTE(review): the enclosing function signature and the declaration of the
// 'Features' vector are elided in this excerpt.
253 StringRef AttributeStr =
254 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
255 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
257 FeatureStr.split(Features, ",");
258 return AArch64::getFMVPriority(Features);
259}
260
// A function is treated as multiversioned iff it carries the "fmv-features"
// attribute (signature line elided in this excerpt).
262 return F.hasFnAttribute("fmv-features");
263}
264
// areInlineCompatible: decides whether Callee may be inlined into Caller from
// the SME streaming-mode/ZA point of view before deferring to the base check.
// NOTE(review): the first line of this signature is elided in this excerpt.
266 const Function *Callee) const {
267 SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
268
269 // When inlining, we should consider the body of the function, not the
270 // interface.
271 if (CalleeAttrs.hasStreamingBody()) {
272 CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
273 CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
274 }
275
// A callee that creates fresh ZA or ZT0 state is never inlined.
276 if (CalleeAttrs.isNewZA() || CalleeAttrs.isNewZT0())
277 return false;
278
// If the call would need a lazy save or a streaming-mode/ZA-preservation
// transition, only inline when the callee body has no operations that are
// incompatible with the caller's mode.
279 if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
280 CallerAttrs.requiresSMChange(CalleeAttrs) ||
281 CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
282 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) {
283 if (hasPossibleIncompatibleOps(Callee))
284 return false;
285 }
286
287 return BaseT::areInlineCompatible(Caller, Callee);
288}
289
// areTypesABICompatible: rejects argument-promotion candidates whose pointee
// types have no AArch64 ABI lowering (large fixed-length SVE VLS vectors).
// NOTE(review): the first line of this signature is elided in this excerpt.
291 const Function *Caller, const Function *Callee,
292 const ArrayRef<Type *> &Types) const {
// Defer to the generic ABI-compatibility check first.
293 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
294 return false;
295
296 // We need to ensure that argument promotion does not attempt to promote
297 // pointers to fixed-length vector types larger than 128 bits like
298 // <8 x float> (and pointers to aggregate types which have such fixed-length
299 // vector type members) into the values of the pointees. Such vector types
300 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
301 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
302 // types can be safely treated as 128-bit NEON types and they cannot be
303 // distinguished in IR.
304 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
305 auto FVTy = dyn_cast<FixedVectorType>(Ty);
306 return FVTy &&
307 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
308 }))
309 return false;
310
311 return true;
312}
313
// getInlineCallPenalty: scales the default call penalty when the call would
// require a PSTATE.SM (streaming-mode) change.
// NOTE(review): the middle line of this signature (function name and first
// parameters) is elided in this excerpt.
314unsigned
316 unsigned DefaultCallPenalty) const {
317 // This function calculates a penalty for executing Call in F.
318 //
319 // There are two ways this function can be called:
320 // (1) F:
321 // call from F -> G (the call here is Call)
322 //
323 // For (1), Call.getCaller() == F, so it will always return a high cost if
324 // a streaming-mode change is required (thus promoting the need to inline the
325 // function)
326 //
327 // (2) F:
328 // call from F -> G (the call here is not Call)
329 // G:
330 // call from G -> H (the call here is Call)
331 //
332 // For (2), if after inlining the body of G into F the call to H requires a
333 // streaming-mode change, and the call to G from F would also require a
334 // streaming-mode change, then there is benefit to do the streaming-mode
335 // change only once and avoid inlining of G into F.
336 SMEAttrs FAttrs(*F);
337 SMEAttrs CalleeAttrs(Call);
338 if (FAttrs.requiresSMChange(CalleeAttrs)) {
339 if (F == Call.getCaller()) // (1)
340 return CallPenaltyChangeSM * DefaultCallPenalty;
341 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
342 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
343 }
344
345 return DefaultCallPenalty;
346}
347
352 ST->isNeonAvailable());
353}
354
355/// Calculate the cost of materializing a 64-bit value. This helper
356/// method might only calculate a fraction of a larger immediate. Therefore it
357/// is valid to return a cost of ZERO.
// NOTE(review): the function signature line is elided in this excerpt.
359 // Check if the immediate can be encoded within an instruction.
360 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
361 return 0;
362
// Negative values are costed via their bitwise complement — presumably so the
// expansion below models a MOVN-based sequence; confirm in the full source.
363 if (Val < 0)
364 Val = ~Val;
365
366 // Calculate how many moves we will need to materialize this constant.
// NOTE(review): the declaration of 'Insn' and the call that expands the
// immediate into a move-instruction sequence are elided in this excerpt.
369 return Insn.size();
370}
371
372/// Calculate the cost of materializing the given constant.
// NOTE(review): the function signature lines are elided in this excerpt.
375 assert(Ty->isIntegerTy());
376
377 unsigned BitSize = Ty->getPrimitiveSizeInBits();
378 if (BitSize == 0)
379 return ~0U;
380
381 // Sign-extend all constants to a multiple of 64-bit.
382 APInt ImmVal = Imm;
383 if (BitSize & 0x3f)
384 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
385
386 // Split the constant into 64-bit chunks and calculate the cost for each
387 // chunk.
// NOTE(review): the declaration of the 'Cost' accumulator is elided in this
// excerpt.
389 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
390 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
391 int64_t Val = Tmp.getSExtValue();
392 Cost += getIntImmCost(Val);
393 }
394 // We need at least one instruction to materialize the constant.
395 return std::max<InstructionCost>(1, Cost);
396}
397
// getIntImmCostInst: cost of using immediate \p Imm as operand \p Idx of an
// instruction with the given opcode; TCC_Free means the immediate need not be
// hoisted by constant hoisting.
// NOTE(review): the opening line of this signature is elided in this excerpt.
399 const APInt &Imm, Type *Ty,
401 Instruction *Inst) {
402 assert(Ty->isIntegerTy());
403
404 unsigned BitSize = Ty->getPrimitiveSizeInBits();
405 // There is no cost model for constants with a bit size of 0. Return TCC_Free
406 // here, so that constant hoisting will ignore this constant.
407 if (BitSize == 0)
408 return TTI::TCC_Free;
409
// Determine which operand slot (if any) of this opcode can take an immediate.
410 unsigned ImmIdx = ~0U;
411 switch (Opcode) {
412 default:
413 return TTI::TCC_Free;
414 case Instruction::GetElementPtr:
415 // Always hoist the base address of a GetElementPtr.
416 if (Idx == 0)
417 return 2 * TTI::TCC_Basic;
418 return TTI::TCC_Free;
419 case Instruction::Store:
420 ImmIdx = 0;
421 break;
422 case Instruction::Add:
423 case Instruction::Sub:
424 case Instruction::Mul:
425 case Instruction::UDiv:
426 case Instruction::SDiv:
427 case Instruction::URem:
428 case Instruction::SRem:
429 case Instruction::And:
430 case Instruction::Or:
431 case Instruction::Xor:
432 case Instruction::ICmp:
433 ImmIdx = 1;
434 break;
435 // Always return TCC_Free for the shift value of a shift instruction.
436 case Instruction::Shl:
437 case Instruction::LShr:
438 case Instruction::AShr:
439 if (Idx == 1)
440 return TTI::TCC_Free;
441 break;
442 case Instruction::Trunc:
443 case Instruction::ZExt:
444 case Instruction::SExt:
445 case Instruction::IntToPtr:
446 case Instruction::PtrToInt:
447 case Instruction::BitCast:
448 case Instruction::PHI:
449 case Instruction::Call:
450 case Instruction::Select:
451 case Instruction::Ret:
452 case Instruction::Load:
453 break;
454 }
455
456 if (Idx == ImmIdx) {
457 int NumConstants = (BitSize + 63) / 64;
// NOTE(review): the line computing 'Cost' (the materialization cost of the
// immediate) and the function's final return are elided in this excerpt.
459 return (Cost <= NumConstants * TTI::TCC_Basic)
460 ? static_cast<int>(TTI::TCC_Free)
461 : Cost;
462 }
464}
465
// getIntImmCostIntrin: cost of using immediate \p Imm as operand \p Idx of the
// intrinsic \p IID; TCC_Free means constant hoisting should leave it alone.
// NOTE(review): the opening line of this signature is elided in this excerpt.
468 const APInt &Imm, Type *Ty,
470 assert(Ty->isIntegerTy());
471
472 unsigned BitSize = Ty->getPrimitiveSizeInBits();
473 // There is no cost model for constants with a bit size of 0. Return TCC_Free
474 // here, so that constant hoisting will ignore this constant.
475 if (BitSize == 0)
476 return TTI::TCC_Free;
477
478 // Most (all?) AArch64 intrinsics do not support folding immediates into the
479 // selected instruction, so we compute the materialization cost for the
480 // immediate directly.
// NOTE(review): the return statement for this AArch64-intrinsic range is
// elided in this excerpt.
481 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
483
484 switch (IID) {
485 default:
486 return TTI::TCC_Free;
487 case Intrinsic::sadd_with_overflow:
488 case Intrinsic::uadd_with_overflow:
489 case Intrinsic::ssub_with_overflow:
490 case Intrinsic::usub_with_overflow:
491 case Intrinsic::smul_with_overflow:
492 case Intrinsic::umul_with_overflow:
493 if (Idx == 1) {
494 int NumConstants = (BitSize + 63) / 64;
// NOTE(review): the line computing 'Cost' is elided in this excerpt.
496 return (Cost <= NumConstants * TTI::TCC_Basic)
497 ? static_cast<int>(TTI::TCC_Free)
498 : Cost;
499 }
500 break;
501 case Intrinsic::experimental_stackmap:
502 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
503 return TTI::TCC_Free;
504 break;
505 case Intrinsic::experimental_patchpoint_void:
506 case Intrinsic::experimental_patchpoint:
507 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
508 return TTI::TCC_Free;
509 break;
510 case Intrinsic::experimental_gc_statepoint:
511 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
512 return TTI::TCC_Free;
513 break;
514 }
// NOTE(review): the function's final return is elided in this excerpt.
516}
517
// getPopcntSupport: classifies popcount support for a type of width TyWidth.
// NOTE(review): the signature and the return taken for 32/64-bit widths
// (presumably a fast-hardware classification) are elided in this excerpt —
// confirm against the full source.
520 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
521 if (TyWidth == 32 || TyWidth == 64)
523 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
524 return TTI::PSK_Software;
525}
526
// Returns true for "unpacked" scalable vector types.
// NOTE(review): the second operand of the && (presumably a size comparison
// against the full SVE block width) is elided in this excerpt.
527static bool isUnpackedVectorVT(EVT VecVT) {
528 return VecVT.isScalableVector() &&
530}
531
// getHistogramCost: models the cost of the experimental histogram-add
// intrinsic as a multiple of the base HISTCNT cost.
// NOTE(review): the function signature and the early 'invalid cost' returns
// (after the element-type and element-count checks) are elided in this
// excerpt — confirm against the full source.
533 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
534 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
535 unsigned TotalHistCnts = 1;
536
537 unsigned EltSize = EltTy->getScalarSizeInBits();
538 // Only allow (up to 64b) integers or pointers
539 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
541
542 // FIXME: We should be able to generate histcnt for fixed-length vectors
543 // using ptrue with a specific VL.
544 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
545 unsigned EC = VTy->getElementCount().getKnownMinValue();
546 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
548
549 // HistCnt only supports 32b and 64b element types
550 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
551
552 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
554
555 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
556 TotalHistCnts = EC / NaturalVectorWidth;
557 }
558
559 return InstructionCost(BaseHistCntCost * TotalHistCnts);
560}
561
// getIntrinsicInstrCost: per-intrinsic cost modelling for AArch64.
// NOTE(review): this span comes from a rendered listing — the function's
// signature and a number of interior lines are elided (several early returns,
// and most of the 'LT = getTypeLegalizationCost(...)' initialisations that
// the cases below read). Do not edit this span without the full source.
565 // The code-generator is currently not able to handle scalable vectors
566 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
567 // it. This change will be removed when code-generation for these types is
568 // sufficiently reliable.
569 auto *RetTy = ICA.getReturnType();
570 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
571 if (VTy->getElementCount() == ElementCount::getScalable(1))
573
574 switch (ICA.getID()) {
575 case Intrinsic::experimental_vector_histogram_add:
576 if (!ST->hasSVE2())
578 return getHistogramCost(ICA);
579 case Intrinsic::umin:
580 case Intrinsic::umax:
581 case Intrinsic::smin:
582 case Intrinsic::smax: {
583 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
584 MVT::v8i16, MVT::v2i32, MVT::v4i32,
585 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
586 MVT::nxv2i64};
588 // v2i64 types get converted to cmp+bif hence the cost of 2
589 if (LT.second == MVT::v2i64)
590 return LT.first * 2;
591 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
592 return LT.first;
593 break;
594 }
595 case Intrinsic::sadd_sat:
596 case Intrinsic::ssub_sat:
597 case Intrinsic::uadd_sat:
598 case Intrinsic::usub_sat: {
599 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
600 MVT::v8i16, MVT::v2i32, MVT::v4i32,
601 MVT::v2i64};
603 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
604 // need to extend the type, as it uses shr(qadd(shl, shl)).
605 unsigned Instrs =
606 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
607 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
608 return LT.first * Instrs;
609 break;
610 }
611 case Intrinsic::abs: {
612 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
613 MVT::v8i16, MVT::v2i32, MVT::v4i32,
614 MVT::v2i64};
616 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
617 return LT.first;
618 break;
619 }
620 case Intrinsic::bswap: {
621 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
622 MVT::v4i32, MVT::v2i64};
624 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
625 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
626 return LT.first;
627 break;
628 }
629 case Intrinsic::stepvector: {
630 InstructionCost Cost = 1; // Cost of the `index' instruction
632 // Legalisation of illegal vectors involves an `index' instruction plus
633 // (LT.first - 1) vector adds.
634 if (LT.first > 1) {
635 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
636 InstructionCost AddCost =
637 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
638 Cost += AddCost * (LT.first - 1);
639 }
640 return Cost;
641 }
642 case Intrinsic::vector_extract:
643 case Intrinsic::vector_insert: {
644 // If both the vector and subvector types are legal types and the index
645 // is 0, then this should be a no-op or simple operation; return a
646 // relatively low cost.
647
648 // If arguments aren't actually supplied, then we cannot determine the
649 // value of the index. We also want to skip predicate types.
650 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
652 break;
653
654 LLVMContext &C = RetTy->getContext();
655 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
656 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
657 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
658 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
659 // Skip this if either the vector or subvector types are unpacked
660 // SVE types; they may get lowered to stack stores and loads.
661 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
662 break;
663
665 getTLI()->getTypeConversion(C, SubVecVT);
667 getTLI()->getTypeConversion(C, VecVT);
668 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
669 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
670 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
671 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
672 return TTI::TCC_Free;
673 break;
674 }
675 case Intrinsic::bitreverse: {
676 static const CostTblEntry BitreverseTbl[] = {
677 {Intrinsic::bitreverse, MVT::i32, 1},
678 {Intrinsic::bitreverse, MVT::i64, 1},
679 {Intrinsic::bitreverse, MVT::v8i8, 1},
680 {Intrinsic::bitreverse, MVT::v16i8, 1},
681 {Intrinsic::bitreverse, MVT::v4i16, 2},
682 {Intrinsic::bitreverse, MVT::v8i16, 2},
683 {Intrinsic::bitreverse, MVT::v2i32, 2},
684 {Intrinsic::bitreverse, MVT::v4i32, 2},
685 {Intrinsic::bitreverse, MVT::v1i64, 2},
686 {Intrinsic::bitreverse, MVT::v2i64, 2},
687 };
688 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
689 const auto *Entry =
690 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
691 if (Entry) {
692 // Cost Model is using the legal type(i32) that i8 and i16 will be
693 // converted to +1 so that we match the actual lowering cost
694 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
695 TLI->getValueType(DL, RetTy, true) == MVT::i16)
696 return LegalisationCost.first * Entry->Cost + 1;
697
698 return LegalisationCost.first * Entry->Cost;
699 }
700 break;
701 }
702 case Intrinsic::ctpop: {
703 if (!ST->hasNEON()) {
704 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
705 return getTypeLegalizationCost(RetTy).first * 12;
706 }
707 static const CostTblEntry CtpopCostTbl[] = {
708 {ISD::CTPOP, MVT::v2i64, 4},
709 {ISD::CTPOP, MVT::v4i32, 3},
710 {ISD::CTPOP, MVT::v8i16, 2},
711 {ISD::CTPOP, MVT::v16i8, 1},
712 {ISD::CTPOP, MVT::i64, 4},
713 {ISD::CTPOP, MVT::v2i32, 3},
714 {ISD::CTPOP, MVT::v4i16, 2},
715 {ISD::CTPOP, MVT::v8i8, 1},
716 {ISD::CTPOP, MVT::i32, 5},
717 };
719 MVT MTy = LT.second;
720 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
721 // Extra cost of +1 when illegal vector types are legalized by promoting
722 // the integer type.
723 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
724 RetTy->getScalarSizeInBits()
725 ? 1
726 : 0;
727 return LT.first * Entry->Cost + ExtraCost;
728 }
729 break;
730 }
731 case Intrinsic::sadd_with_overflow:
732 case Intrinsic::uadd_with_overflow:
733 case Intrinsic::ssub_with_overflow:
734 case Intrinsic::usub_with_overflow:
735 case Intrinsic::smul_with_overflow:
736 case Intrinsic::umul_with_overflow: {
737 static const CostTblEntry WithOverflowCostTbl[] = {
738 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
739 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
740 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
741 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
742 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
743 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
744 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
745 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
746 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
747 {Intrinsic::usub_with_overflow, MVT::i8, 3},
748 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
749 {Intrinsic::usub_with_overflow, MVT::i16, 3},
750 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
751 {Intrinsic::usub_with_overflow, MVT::i32, 1},
752 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
753 {Intrinsic::usub_with_overflow, MVT::i64, 1},
754 {Intrinsic::smul_with_overflow, MVT::i8, 5},
755 {Intrinsic::umul_with_overflow, MVT::i8, 4},
756 {Intrinsic::smul_with_overflow, MVT::i16, 5},
757 {Intrinsic::umul_with_overflow, MVT::i16, 4},
758 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
759 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
760 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
761 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
762 };
763 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
764 if (MTy.isSimple())
765 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
766 MTy.getSimpleVT()))
767 return Entry->Cost;
768 break;
769 }
770 case Intrinsic::fptosi_sat:
771 case Intrinsic::fptoui_sat: {
772 if (ICA.getArgTypes().empty())
773 break;
774 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
775 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
776 EVT MTy = TLI->getValueType(DL, RetTy);
777 // Check for the legal types, which are where the size of the input and the
778 // output are the same, or we are using cvt f64->i32 or f32->i64.
779 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
780 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
781 LT.second == MVT::v2f64)) {
782 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
783 (LT.second == MVT::f64 && MTy == MVT::i32) ||
784 (LT.second == MVT::f32 && MTy == MVT::i64)))
785 return LT.first;
786 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
787 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
788 MTy.getScalarSizeInBits() == 64)
789 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
790 }
791 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
792 // f32.
793 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
794 return LT.first + getIntrinsicInstrCost(
795 {ICA.getID(),
796 RetTy,
797 {ICA.getArgTypes()[0]->getWithNewType(
798 Type::getFloatTy(RetTy->getContext()))}},
799 CostKind);
800 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
801 (LT.second == MVT::f16 && MTy == MVT::i64) ||
802 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
803 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
804 return LT.first;
805 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
806 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
807 MTy.getScalarSizeInBits() == 32)
808 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
809 // Extending vector types v8f16->v8i32. These current scalarize but the
810 // codegen could be better.
811 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
812 MTy.getScalarSizeInBits() == 64)
813 return MTy.getVectorNumElements() * 3;
814
815 // If we can we use a legal convert followed by a min+max
816 if ((LT.second.getScalarType() == MVT::f32 ||
817 LT.second.getScalarType() == MVT::f64 ||
818 LT.second.getScalarType() == MVT::f16) &&
819 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
820 Type *LegalTy =
821 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits())
822 if (LT.second.isVector())
823 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
825 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
826 LegalTy, {LegalTy, LegalTy});
828 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
829 LegalTy, {LegalTy, LegalTy});
831 return LT.first * Cost +
832 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
833 : 1);
834 }
835 // Otherwise we need to follow the default expansion that clamps the value
836 // using a float min/max with a fcmp+sel for nan handling when signed.
837 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
838 RetTy = RetTy->getScalarType();
839 if (LT.second.isVector()) {
840 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
841 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
842 }
843 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
845 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
847 Cost +=
848 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
850 if (IsSigned) {
851 Type *CondTy = RetTy->getWithNewBitWidth(1);
852 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
854 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
856 }
857 return LT.first * Cost;
858 }
859 case Intrinsic::fshl:
860 case Intrinsic::fshr: {
861 if (ICA.getArgs().empty())
862 break;
863
864 // TODO: Add handling for fshl where third argument is not a constant.
865 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
866 if (!OpInfoZ.isConstant())
867 break;
868
869 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
870 if (OpInfoZ.isUniform()) {
871 // FIXME: The costs could be lower if the codegen is better.
872 static const CostTblEntry FshlTbl[] = {
873 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
874 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
875 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
876 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
877 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
878 // to avoid having to duplicate the costs.
879 const auto *Entry =
880 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
881 if (Entry)
882 return LegalisationCost.first * Entry->Cost;
883 }
884
885 auto TyL = getTypeLegalizationCost(RetTy);
886 if (!RetTy->isIntegerTy())
887 break;
888
889 // Estimate cost manually, as types like i8 and i16 will get promoted to
890 // i32 and CostTableLookup will ignore the extra conversion cost.
891 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
892 RetTy->getScalarSizeInBits() < 64) ||
893 (RetTy->getScalarSizeInBits() % 64 != 0);
894 unsigned ExtraCost = HigherCost ? 1 : 0;
895 if (RetTy->getScalarSizeInBits() == 32 ||
896 RetTy->getScalarSizeInBits() == 64)
897 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
898 // extr instruction.
899 else if (HigherCost)
900 ExtraCost = 1;
901 else
902 break;
903 return TyL.first + ExtraCost;
904 }
905 case Intrinsic::get_active_lane_mask: {
906 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
907 if (RetTy) {
908 EVT RetVT = getTLI()->getValueType(DL, RetTy);
909 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
910 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
911 !getTLI()->isTypeLegal(RetVT)) {
912 // We don't have enough context at this point to determine if the mask
913 // is going to be kept live after the block, which will force the vXi1
914 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
915 // For now, we just assume the vectorizer created this intrinsic and
916 // the result will be the input for a PHI. In this case the cost will
917 // be extremely high for fixed-width vectors.
918 // NOTE: getScalarizationOverhead returns a cost that's far too
919 // pessimistic for the actual generated codegen. In reality there are
920 // two instructions generated per lane.
921 return RetTy->getNumElements() * 2;
922 }
923 }
924 break;
925 }
926 case Intrinsic::experimental_vector_match: {
927 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
928 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
929 unsigned SearchSize = NeedleTy->getNumElements();
930 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
931 // Base cost for MATCH instructions. At least on the Neoverse V2 and
932 // Neoverse V3, these are cheap operations with the same latency as a
933 // vector ADD. In most cases, however, we also need to do an extra DUP.
934 // For fixed-length vectors we currently need an extra five--six
935 // instructions besides the MATCH.
937 if (isa<FixedVectorType>(RetTy))
938 Cost += 10;
939 return Cost;
940 }
941 break;
942 }
943 case Intrinsic::experimental_cttz_elts: {
944 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
945 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
946 // This will consist of a SVE brkb and a cntp instruction. These
947 // typically have the same latency and half the throughput as a vector
948 // add instruction.
949 return 4;
950 }
951 break;
952 }
953 default:
954 break;
955 }
957}
958
959/// The function will remove redundant reinterprets casting in the presence
960/// of the control flow
961static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
962 IntrinsicInst &II) {
964 auto RequiredType = II.getType();
965
966 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
967 assert(PN && "Expected Phi Node!");
968
969 // Don't create a new Phi unless we can remove the old one.
970 if (!PN->hasOneUse())
971 return std::nullopt;
972
973 for (Value *IncValPhi : PN->incoming_values()) {
974 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
975 if (!Reinterpret ||
976 Reinterpret->getIntrinsicID() !=
977 Intrinsic::aarch64_sve_convert_to_svbool ||
978 RequiredType != Reinterpret->getArgOperand(0)->getType())
979 return std::nullopt;
980 }
981
982 // Create the new Phi
983 IC.Builder.SetInsertPoint(PN);
984 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
985 Worklist.push_back(PN);
986
987 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
988 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
989 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
990 Worklist.push_back(Reinterpret);
991 }
992
993 // Cleanup Phi Node and reinterprets
994 return IC.replaceInstUsesWith(II, NPN);
995}
996
997// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
998// => (binop (pred) (from_svbool _) (from_svbool _))
999//
1000// The above transformation eliminates a `to_svbool` in the predicate
1001// operand of bitwise operation `binop` by narrowing the vector width of
1002// the operation. For example, it would convert a `<vscale x 16 x i1>
1003// and` into a `<vscale x 4 x i1> and`. This is profitable because
1004// to_svbool must zero the new lanes during widening, whereas
1005// from_svbool is free.
1006static std::optional<Instruction *>
1008 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1009 if (!BinOp)
1010 return std::nullopt;
1011
1012 auto IntrinsicID = BinOp->getIntrinsicID();
1013 switch (IntrinsicID) {
1014 case Intrinsic::aarch64_sve_and_z:
1015 case Intrinsic::aarch64_sve_bic_z:
1016 case Intrinsic::aarch64_sve_eor_z:
1017 case Intrinsic::aarch64_sve_nand_z:
1018 case Intrinsic::aarch64_sve_nor_z:
1019 case Intrinsic::aarch64_sve_orn_z:
1020 case Intrinsic::aarch64_sve_orr_z:
1021 break;
1022 default:
1023 return std::nullopt;
1024 }
1025
1026 auto BinOpPred = BinOp->getOperand(0);
1027 auto BinOpOp1 = BinOp->getOperand(1);
1028 auto BinOpOp2 = BinOp->getOperand(2);
1029
1030 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1031 if (!PredIntr ||
1032 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1033 return std::nullopt;
1034
1035 auto PredOp = PredIntr->getOperand(0);
1036 auto PredOpTy = cast<VectorType>(PredOp->getType());
1037 if (PredOpTy != II.getType())
1038 return std::nullopt;
1039
1040 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1041 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1042 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1043 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1044 if (BinOpOp1 == BinOpOp2)
1045 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1046 else
1047 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1048 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1049
1050 auto NarrowedBinOp =
1051 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1052 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1053}
1054
1055static std::optional<Instruction *>
1057 // If the reinterpret instruction operand is a PHI Node
1058 if (isa<PHINode>(II.getArgOperand(0)))
1059 return processPhiNode(IC, II);
1060
1061 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1062 return BinOpCombine;
1063
1064 // Ignore converts to/from svcount_t.
1065 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1066 isa<TargetExtType>(II.getType()))
1067 return std::nullopt;
1068
1069 SmallVector<Instruction *, 32> CandidatesForRemoval;
1070 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1071
1072 const auto *IVTy = cast<VectorType>(II.getType());
1073
1074 // Walk the chain of conversions.
1075 while (Cursor) {
1076 // If the type of the cursor has fewer lanes than the final result, zeroing
1077 // must take place, which breaks the equivalence chain.
1078 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1079 if (CursorVTy->getElementCount().getKnownMinValue() <
1080 IVTy->getElementCount().getKnownMinValue())
1081 break;
1082
1083 // If the cursor has the same type as I, it is a viable replacement.
1084 if (Cursor->getType() == IVTy)
1085 EarliestReplacement = Cursor;
1086
1087 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1088
1089 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1090 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1091 Intrinsic::aarch64_sve_convert_to_svbool ||
1092 IntrinsicCursor->getIntrinsicID() ==
1093 Intrinsic::aarch64_sve_convert_from_svbool))
1094 break;
1095
1096 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1097 Cursor = IntrinsicCursor->getOperand(0);
1098 }
1099
1100 // If no viable replacement in the conversion chain was found, there is
1101 // nothing to do.
1102 if (!EarliestReplacement)
1103 return std::nullopt;
1104
1105 return IC.replaceInstUsesWith(II, EarliestReplacement);
1106}
1107
1108static bool isAllActivePredicate(Value *Pred) {
1109 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1110 Value *UncastedPred;
1111 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1112 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1113 m_Value(UncastedPred)))))
1114 // If the predicate has the same or less lanes than the uncasted
1115 // predicate then we know the casting has no effect.
1116 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1117 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1118 Pred = UncastedPred;
1119
1120 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1121 m_ConstantInt<AArch64SVEPredPattern::all>()));
1122}
1123
1124// Simplify unary operation where predicate has all inactive lanes by replacing
1125// instruction with its operand
1126static std::optional<Instruction *>
1128 bool hasInactiveVector) {
1129 int PredOperand = hasInactiveVector ? 1 : 0;
1130 int ReplaceOperand = hasInactiveVector ? 0 : 1;
1131 if (match(II.getOperand(PredOperand), m_ZeroInt())) {
1132 IC.replaceInstUsesWith(II, II.getOperand(ReplaceOperand));
1133 return IC.eraseInstFromFunction(II);
1134 }
1135 return std::nullopt;
1136}
1137
1138// Simplify unary operation where predicate has all inactive lanes or
1139// replace unused first operand with undef when all lanes are active
1140static std::optional<Instruction *>
1142 if (isAllActivePredicate(II.getOperand(1)) &&
1143 !isa<llvm::UndefValue>(II.getOperand(0)) &&
1144 !isa<llvm::PoisonValue>(II.getOperand(0))) {
1145 Value *Undef = llvm::UndefValue::get(II.getType());
1146 return IC.replaceOperand(II, 0, Undef);
1147 }
1148 return instCombineSVENoActiveReplace(IC, II, true);
1149}
1150
1151// Erase unary operation where predicate has all inactive lanes
1152static std::optional<Instruction *>
1154 int PredPos) {
1155 if (match(II.getOperand(PredPos), m_ZeroInt())) {
1156 return IC.eraseInstFromFunction(II);
1157 }
1158 return std::nullopt;
1159}
1160
1161// Simplify operation where predicate has all inactive lanes by replacing
1162// instruction with zeroed object
1163static std::optional<Instruction *>
1165 if (match(II.getOperand(0), m_ZeroInt())) {
1166 Constant *Node;
1167 Type *RetTy = II.getType();
1168 if (RetTy->isStructTy()) {
1169 auto StructT = cast<StructType>(RetTy);
1170 auto VecT = StructT->getElementType(0);
1172 for (unsigned i = 0; i < StructT->getNumElements(); i++) {
1173 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
1174 : ConstantInt::get(VecT, 0));
1175 }
1176 Node = ConstantStruct::get(StructT, ZerVec);
1177 } else
1178 Node = RetTy->isFPOrFPVectorTy() ? ConstantFP::get(RetTy, 0.0)
1179 : ConstantInt::get(II.getType(), 0);
1180
1182 return IC.eraseInstFromFunction(II);
1183 }
1184 return std::nullopt;
1185}
1186
1187static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1188 IntrinsicInst &II) {
1189 // svsel(ptrue, x, y) => x
1190 auto *OpPredicate = II.getOperand(0);
1191 if (isAllActivePredicate(OpPredicate))
1192 return IC.replaceInstUsesWith(II, II.getOperand(1));
1193
1194 auto Select =
1195 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1196 return IC.replaceInstUsesWith(II, Select);
1197}
1198
1199static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1200 IntrinsicInst &II) {
1201 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1202 if (!Pg)
1203 return std::nullopt;
1204
1205 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1206 return std::nullopt;
1207
1208 const auto PTruePattern =
1209 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1210 if (PTruePattern != AArch64SVEPredPattern::vl1)
1211 return std::nullopt;
1212
1213 // The intrinsic is inserting into lane zero so use an insert instead.
1214 auto *IdxTy = Type::getInt64Ty(II.getContext());
1215 auto *Insert = InsertElementInst::Create(
1216 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1217 Insert->insertBefore(II.getIterator());
1218 Insert->takeName(&II);
1219
1220 return IC.replaceInstUsesWith(II, Insert);
1221}
1222
1223static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1224 IntrinsicInst &II) {
1225 // Replace DupX with a regular IR splat.
1226 auto *RetTy = cast<ScalableVectorType>(II.getType());
1227 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1228 II.getArgOperand(0));
1229 Splat->takeName(&II);
1230 return IC.replaceInstUsesWith(II, Splat);
1231}
1232
// Fold cmpne(ptrue all, dupq(constant), 0) into a constant ptrue-style
// predicate when the constant's active pattern is expressible as a single
// SVE predicate element size.
static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();

  // Replace by zero constant when all lanes are inactive
  if (auto II_NA = instCombineSVENoActiveZero(IC, II))
    return II_NA;

  // Check that the predicate is all active
  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::all)
    return std::nullopt;

  // Check that we have a compare of zero..
  auto *SplatValue =
      dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
  if (!SplatValue || !SplatValue->isZero())
    return std::nullopt;

  // ..against a dupq
  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!DupQLane ||
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return std::nullopt;

  // Where the dupq is a lane 0 replicate of a vector insert
  auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
  if (!DupQLaneIdx || !DupQLaneIdx->isZero())
    return std::nullopt;

  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
    return std::nullopt;

  // Where the vector insert is a fixed constant vector insert into undef at
  // index zero
  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return std::nullopt;

  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return std::nullopt;

  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
  if (!ConstVec)
    return std::nullopt;

  // The fixed source vector must have exactly one element per minimum lane
  // of the scalable result, so the lanes correspond one-to-one.
  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return std::nullopt;

  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  // Expand intrinsic operands to a 16-bit byte level predicate
  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
    if (!Arg)
      return std::nullopt;
    if (!Arg->isZero())
      // Each logical lane owns 16/NumElts predicate bits; set its low bit.
      PredicateBits |= 1 << (I * (16 / NumElts));
  }

  // If all bits are zero bail early with an empty predicate
  if (PredicateBits == 0) {
    auto *PFalse = Constant::getNullValue(II.getType());
    PFalse->takeName(&II);
    return IC.replaceInstUsesWith(II, PFalse);
  }

  // Calculate largest predicate type used (where byte predicate is largest)
  unsigned Mask = 8;
  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)
      Mask |= (I % 8);

  // Lowest set bit of Mask gives the element size (in bytes) of the pattern.
  unsigned PredSize = Mask & -Mask;
  auto *PredType = ScalableVectorType::get(
      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));

  // Ensure all relevant bits are set
  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return std::nullopt;

  // Materialise as ptrue(all) of the deduced element width, then reinterpret
  // via svbool to the intrinsic's result type.
  auto *PTruePat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                           {PredType}, {PTruePat});
  auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
      IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                                 {II.getType()}, {ConvertToSVBool});

  ConvertFromSVBool->takeName(&II);
  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
}
1336
1337static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1338 IntrinsicInst &II) {
1339 Value *Pg = II.getArgOperand(0);
1340 Value *Vec = II.getArgOperand(1);
1341 auto IntrinsicID = II.getIntrinsicID();
1342 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1343
1344 // lastX(splat(X)) --> X
1345 if (auto *SplatVal = getSplatValue(Vec))
1346 return IC.replaceInstUsesWith(II, SplatVal);
1347
1348 // If x and/or y is a splat value then:
1349 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1350 Value *LHS, *RHS;
1351 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1352 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1353 auto *OldBinOp = cast<BinaryOperator>(Vec);
1354 auto OpC = OldBinOp->getOpcode();
1355 auto *NewLHS =
1356 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1357 auto *NewRHS =
1358 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1360 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1361 return IC.replaceInstUsesWith(II, NewBinOp);
1362 }
1363 }
1364
1365 auto *C = dyn_cast<Constant>(Pg);
1366 if (IsAfter && C && C->isNullValue()) {
1367 // The intrinsic is extracting lane 0 so use an extract instead.
1368 auto *IdxTy = Type::getInt64Ty(II.getContext());
1369 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1370 Extract->insertBefore(II.getIterator());
1371 Extract->takeName(&II);
1372 return IC.replaceInstUsesWith(II, Extract);
1373 }
1374
1375 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1376 if (!IntrPG)
1377 return std::nullopt;
1378
1379 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1380 return std::nullopt;
1381
1382 const auto PTruePattern =
1383 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1384
1385 // Can the intrinsic's predicate be converted to a known constant index?
1386 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1387 if (!MinNumElts)
1388 return std::nullopt;
1389
1390 unsigned Idx = MinNumElts - 1;
1391 // Increment the index if extracting the element after the last active
1392 // predicate element.
1393 if (IsAfter)
1394 ++Idx;
1395
1396 // Ignore extracts whose index is larger than the known minimum vector
1397 // length. NOTE: This is an artificial constraint where we prefer to
1398 // maintain what the user asked for until an alternative is proven faster.
1399 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1400 if (Idx >= PgVTy->getMinNumElements())
1401 return std::nullopt;
1402
1403 // The intrinsic is extracting a fixed lane so use an extract instead.
1404 auto *IdxTy = Type::getInt64Ty(II.getContext());
1405 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1406 Extract->insertBefore(II.getIterator());
1407 Extract->takeName(&II);
1408 return IC.replaceInstUsesWith(II, Extract);
1409}
1410
1411static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1412 IntrinsicInst &II) {
1413 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1414 // integer variant across a variety of micro-architectures. Replace scalar
1415 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1416 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1417 // depending on the micro-architecture, but has been observed as generally
1418 // being faster, particularly when the CLAST[AB] op is a loop-carried
1419 // dependency.
1420 Value *Pg = II.getArgOperand(0);
1421 Value *Fallback = II.getArgOperand(1);
1422 Value *Vec = II.getArgOperand(2);
1423 Type *Ty = II.getType();
1424
1425 if (!Ty->isIntegerTy())
1426 return std::nullopt;
1427
1428 Type *FPTy;
1429 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1430 default:
1431 return std::nullopt;
1432 case 16:
1433 FPTy = IC.Builder.getHalfTy();
1434 break;
1435 case 32:
1436 FPTy = IC.Builder.getFloatTy();
1437 break;
1438 case 64:
1439 FPTy = IC.Builder.getDoubleTy();
1440 break;
1441 }
1442
1443 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1444 auto *FPVTy = VectorType::get(
1445 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1446 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1447 auto *FPII = IC.Builder.CreateIntrinsic(
1448 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1449 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1450 return IC.replaceInstUsesWith(II, FPIItoInt);
1451}
1452
1453static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1454 IntrinsicInst &II) {
1455 LLVMContext &Ctx = II.getContext();
1456 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1457 // can work with RDFFR_PP for ptest elimination.
1458 auto *AllPat =
1459 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1460 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1461 {II.getType()}, {AllPat});
1462 auto *RDFFR =
1463 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1464 RDFFR->takeName(&II);
1465 return IC.replaceInstUsesWith(II, RDFFR);
1466}
1467
1468static std::optional<Instruction *>
1470 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1471
1472 if (Pattern == AArch64SVEPredPattern::all) {
1473 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1474 auto *VScale = IC.Builder.CreateVScale(StepVal);
1475 VScale->takeName(&II);
1476 return IC.replaceInstUsesWith(II, VScale);
1477 }
1478
1479 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1480
1481 return MinNumElts && NumElts >= MinNumElts
1482 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1483 II, ConstantInt::get(II.getType(), MinNumElts)))
1484 : std::nullopt;
1485}
1486
// Simplify ptest intrinsics: canonicalise to ptest_any, look through
// convert.to.svbool pairs, and fold ptests of flag-setting operations.
static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  Value *PgVal = II.getArgOperand(0);
  Value *OpVal = II.getArgOperand(1);

  // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
  // Later optimizations prefer this form.
  if (PgVal == OpVal &&
      (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
       II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
    Value *Ops[] = {PgVal, OpVal};
    Type *Tys[] = {PgVal->getType()};

    auto *PTest =
        IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
  IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);

  if (!Pg || !Op)
    return std::nullopt;

  Intrinsic::ID OpIID = Op->getIntrinsicID();

  // When both operands are widened from the same narrow predicate type,
  // perform the ptest directly on the narrow originals.
  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
      Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
    Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
    Type *Tys[] = {Pg->getArgOperand(0)->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);

    PTest->takeName(&II);
    return IC.replaceInstUsesWith(II, PTest);
  }

  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
  // Later optimizations may rewrite sequence to use the flag-setting variant
  // of instruction X to remove PTEST.
  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {
    Value *Ops[] = {Pg->getArgOperand(0), Pg};
    Type *Tys[] = {Pg->getType()};

    auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
    PTest->takeName(&II);

    return IC.replaceInstUsesWith(II, PTest);
  }

  return std::nullopt;
}
1554
1555template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1556static std::optional<Instruction *>
1558 bool MergeIntoAddendOp) {
1559 Value *P = II.getOperand(0);
1560 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1561 if (MergeIntoAddendOp) {
1562 AddendOp = II.getOperand(1);
1563 Mul = II.getOperand(2);
1564 } else {
1565 AddendOp = II.getOperand(2);
1566 Mul = II.getOperand(1);
1567 }
1568
1569 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1570 m_Value(MulOp1))))
1571 return std::nullopt;
1572
1573 if (!Mul->hasOneUse())
1574 return std::nullopt;
1575
1576 Instruction *FMFSource = nullptr;
1577 if (II.getType()->isFPOrFPVectorTy()) {
1578 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1579 // Stop the combine when the flags on the inputs differ in case dropping
1580 // flags would lead to us missing out on more beneficial optimizations.
1581 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1582 return std::nullopt;
1583 if (!FAddFlags.allowContract())
1584 return std::nullopt;
1585 FMFSource = &II;
1586 }
1587
1588 CallInst *Res;
1589 if (MergeIntoAddendOp)
1590 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1591 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1592 else
1593 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1594 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1595
1596 return IC.replaceInstUsesWith(II, Res);
1597}
1598
1599static std::optional<Instruction *>
1601 Value *Pred = II.getOperand(0);
1602 Value *PtrOp = II.getOperand(1);
1603 Type *VecTy = II.getType();
1604
1605 // Replace by zero constant when all lanes are inactive
1606 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1607 return II_NA;
1608
1609 if (isAllActivePredicate(Pred)) {
1610 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1611 Load->copyMetadata(II);
1612 return IC.replaceInstUsesWith(II, Load);
1613 }
1614
1615 CallInst *MaskedLoad =
1616 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1617 Pred, ConstantAggregateZero::get(VecTy));
1618 MaskedLoad->copyMetadata(II);
1619 return IC.replaceInstUsesWith(II, MaskedLoad);
1620}
1621
1622static std::optional<Instruction *>
1624 Value *VecOp = II.getOperand(0);
1625 Value *Pred = II.getOperand(1);
1626 Value *PtrOp = II.getOperand(2);
1627
1628 if (isAllActivePredicate(Pred)) {
1629 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1630 Store->copyMetadata(II);
1631 return IC.eraseInstFromFunction(II);
1632 }
1633
1634 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1635 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1636 MaskedStore->copyMetadata(II);
1637 return IC.eraseInstFromFunction(II);
1638}
1639
1641 switch (Intrinsic) {
1642 case Intrinsic::aarch64_sve_fmul_u:
1643 return Instruction::BinaryOps::FMul;
1644 case Intrinsic::aarch64_sve_fadd_u:
1645 return Instruction::BinaryOps::FAdd;
1646 case Intrinsic::aarch64_sve_fsub_u:
1647 return Instruction::BinaryOps::FSub;
1648 default:
1649 return Instruction::BinaryOpsEnd;
1650 }
1651}
1652
1653static std::optional<Instruction *>
1655 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1656 if (II.isStrictFP())
1657 return std::nullopt;
1658
1659 auto *OpPredicate = II.getOperand(0);
1660 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1661 if (BinOpCode == Instruction::BinaryOpsEnd ||
1662 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1663 m_ConstantInt<AArch64SVEPredPattern::all>())))
1664 return std::nullopt;
1665 auto BinOp = IC.Builder.CreateBinOpFMF(
1666 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
1667 return IC.replaceInstUsesWith(II, BinOp);
1668}
1669
1670// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1671// sve.add_u).
1672static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1673 Intrinsic::ID IID) {
1674 auto *OpPredicate = II.getOperand(0);
1675 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1676 m_ConstantInt<AArch64SVEPredPattern::all>())))
1677 return std::nullopt;
1678
1679 auto *Mod = II.getModule();
1680 auto *NewDecl = Intrinsic::getOrInsertDeclaration(Mod, IID, {II.getType()});
1681 II.setCalledFunction(NewDecl);
1682
1683 return &II;
1684}
1685
1686// Simplify operations where predicate has all inactive lanes or try to replace
1687// with _u form when all lanes are active
1688static std::optional<Instruction *>
1690 Intrinsic::ID IID) {
1691 if (match(II.getOperand(0), m_ZeroInt())) {
1692 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1693 // inactive for sv[func]_m
1694 return IC.replaceInstUsesWith(II, II.getOperand(1));
1695 }
1696 return instCombineSVEAllActive(II, IID);
1697}
1698
// Combine sve.add: canonicalise by predicate activity, then try to fuse a
// feeding multiply into mla (accumulator form) or mad (multiplicand form).
static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
                                                            IntrinsicInst &II) {
  if (auto II_U =
          instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
    return II_U;
  if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
                                                   Intrinsic::aarch64_sve_mla>(
          IC, II, true))
    return MLA;
  if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
                                                   Intrinsic::aarch64_sve_mad>(
          IC, II, false))
    return MAD;
  return std::nullopt;
}
1714
1715static std::optional<Instruction *>
1717 if (auto II_U =
1718 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1719 return II_U;
1720 if (auto FMLA =
1721 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1722 Intrinsic::aarch64_sve_fmla>(IC, II,
1723 true))
1724 return FMLA;
1725 if (auto FMAD =
1726 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1727 Intrinsic::aarch64_sve_fmad>(IC, II,
1728 false))
1729 return FMAD;
1730 if (auto FMLA =
1731 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1732 Intrinsic::aarch64_sve_fmla>(IC, II,
1733 true))
1734 return FMLA;
1735 return std::nullopt;
1736}
1737
1738static std::optional<Instruction *>
1740 if (auto FMLA =
1741 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1742 Intrinsic::aarch64_sve_fmla>(IC, II,
1743 true))
1744 return FMLA;
1745 if (auto FMAD =
1746 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1747 Intrinsic::aarch64_sve_fmad>(IC, II,
1748 false))
1749 return FMAD;
1750 if (auto FMLA_U =
1751 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1752 Intrinsic::aarch64_sve_fmla_u>(
1753 IC, II, true))
1754 return FMLA_U;
1755 return instCombineSVEVectorBinOp(IC, II);
1756}
1757
1758static std::optional<Instruction *>
1760 if (auto II_U =
1761 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1762 return II_U;
1763 if (auto FMLS =
1764 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1765 Intrinsic::aarch64_sve_fmls>(IC, II,
1766 true))
1767 return FMLS;
1768 if (auto FMSB =
1769 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1770 Intrinsic::aarch64_sve_fnmsb>(
1771 IC, II, false))
1772 return FMSB;
1773 if (auto FMLS =
1774 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1775 Intrinsic::aarch64_sve_fmls>(IC, II,
1776 true))
1777 return FMLS;
1778 return std::nullopt;
1779}
1780
1781static std::optional<Instruction *>
1783 if (auto FMLS =
1784 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1785 Intrinsic::aarch64_sve_fmls>(IC, II,
1786 true))
1787 return FMLS;
1788 if (auto FMSB =
1789 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1790 Intrinsic::aarch64_sve_fnmsb>(
1791 IC, II, false))
1792 return FMSB;
1793 if (auto FMLS_U =
1794 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1795 Intrinsic::aarch64_sve_fmls_u>(
1796 IC, II, true))
1797 return FMLS_U;
1798 return instCombineSVEVectorBinOp(IC, II);
1799}
1800
// Combine sve.sub: canonicalise by predicate activity, then try to fuse a
// feeding multiply into mls.
static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
                                                            IntrinsicInst &II) {
  if (auto II_U =
          instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
    return II_U;
  if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
                                                   Intrinsic::aarch64_sve_mls>(
          IC, II, true))
    return MLS;
  return std::nullopt;
}
1812
// Fold [f]mul by a unit splat back to the multiplicand:
//   [f]mul pg %n, (dupx 1)   => %n
//   [f]mul pg %n, (dup pg 1) => %n   (only when the predicates match)
// Falls back to the generic SVE binop combine otherwise.
// NOTE(review): one line of this signature's parameter list (the
// IntrinsicInst &II parameter) is not visible in this rendering — confirm
// against the original source.
static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
                                                             Intrinsic::ID IID) {
  auto *OpPredicate = II.getOperand(0);
  auto *OpMultiplicand = II.getOperand(1);
  auto *OpMultiplier = II.getOperand(2);

  // Return true if a given instruction is a unit splat value, false otherwise.
  auto IsUnitSplat = [](auto *I) {
    auto *SplatValue = getSplatValue(I);
    if (!SplatValue)
      return false;
    // Accept both floating-point 1.0 and integer 1 so the same combine
    // services fmul and mul.
    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
  };

  // Return true if a given instruction is an aarch64_sve_dup intrinsic call
  // with a unit splat value, false otherwise.
  auto IsUnitDup = [](auto *I) {
    auto *IntrI = dyn_cast<IntrinsicInst>(I);
    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
      return false;

    auto *SplatValue = IntrI->getOperand(2);
    return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
  };

  if (IsUnitSplat(OpMultiplier)) {
    // [f]mul pg %n, (dupx 1) => %n
    OpMultiplicand->takeName(&II);
    return IC.replaceInstUsesWith(II, OpMultiplicand);
  } else if (IsUnitDup(OpMultiplier)) {
    // [f]mul pg %n, (dup pg 1) => %n
    auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
    auto *DupPg = DupInst->getOperand(1);
    // TODO: this is naive. The optimization is still valid if DupPg
    // 'encompasses' OpPredicate, not only if they're the same predicate.
    if (OpPredicate == DupPg) {
      OpMultiplicand->takeName(&II);
      return IC.replaceInstUsesWith(II, OpMultiplicand);
    }
  }

  return instCombineSVEVectorBinOp(IC, II);
}
1857
1858static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1859 IntrinsicInst &II) {
1860 Value *UnpackArg = II.getArgOperand(0);
1861 auto *RetTy = cast<ScalableVectorType>(II.getType());
1862 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1863 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1864
1865 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1866 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1867 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1868 ScalarArg =
1869 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1870 Value *NewVal =
1871 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1872 NewVal->takeName(&II);
1873 return IC.replaceInstUsesWith(II, NewVal);
1874 }
1875
1876 return std::nullopt;
1877}
1878static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1879 IntrinsicInst &II) {
1880 auto *OpVal = II.getOperand(0);
1881 auto *OpIndices = II.getOperand(1);
1882 VectorType *VTy = cast<VectorType>(II.getType());
1883
1884 // Check whether OpIndices is a constant splat value < minimal element count
1885 // of result.
1886 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1887 if (!SplatValue ||
1888 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1889 return std::nullopt;
1890
1891 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1892 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1893 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1894 auto *VectorSplat =
1895 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1896
1897 VectorSplat->takeName(&II);
1898 return IC.replaceInstUsesWith(II, VectorSplat);
1899}
1900
// uzp1 combine: when both operands are svbool conversions of values of the
// same scalable type, reconstruct the concatenation <A, B> directly with two
// vector.insert operations instead of the uzp1.
static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *A, *B;
  Type *RetTy = II.getType();
  constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
  constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;

  // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
  // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
  if ((match(II.getArgOperand(0),
             m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
       match(II.getArgOperand(1),
             m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
      (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
       match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
    auto *TyA = cast<ScalableVectorType>(A->getType());
    // NOTE(review): the continuation of this condition and the argument list
    // of the first CreateInsertVector call are not visible in this rendering
    // (two source lines were dropped by the doxygen export) — confirm against
    // the original file before relying on the exact text below.
    if (TyA == B->getType() &&
      auto *SubVec = IC.Builder.CreateInsertVector(
      // Append B after A at offset MinNumElements(A) to form <A, B>.
      auto *ConcatVec = IC.Builder.CreateInsertVector(
          RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
      ConcatVec->takeName(&II);
      return IC.replaceInstUsesWith(II, ConcatVec);
    }
  }

  return std::nullopt;
}
1930
1931static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1932 IntrinsicInst &II) {
1933 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1934 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1935 Value *A, *B;
1936 if (match(II.getArgOperand(0),
1937 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1938 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1939 m_Specific(A), m_Specific(B))))
1940 return IC.replaceInstUsesWith(
1941 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1942
1943 return std::nullopt;
1944}
1945
// Fold a predicated gather whose index vector is a stride-1 sve.index into an
// ordinary masked load, and fold to zero when no lanes are active.
// NOTE(review): the signature line carrying this function's name is not
// visible in this rendering (presumably instCombineLD1GatherIndex) — confirm.
static std::optional<Instruction *>
  Value *Mask = II.getOperand(0);
  Value *BasePtr = II.getOperand(1);
  Value *Index = II.getOperand(2);
  Type *Ty = II.getType();
  // Inactive lanes of the resulting masked load produce zero.
  Value *PassThru = ConstantAggregateZero::get(Ty);

  // Replace by zero constant when all lanes are inactive
  if (auto II_NA = instCombineSVENoActiveZero(IC, II))
    return II_NA;

  // Contiguous gather => masked load.
  // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
  // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
  Value *IndexBase;
  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
                       m_Value(IndexBase), m_SpecificInt(1)))) {
    // Use whatever alignment the base pointer is known to have.
    Align Alignment =
        BasePtr->getPointerAlignment(II.getDataLayout());

    Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
                                      BasePtr, IndexBase);
    CallInst *MaskedLoad =
        IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
    MaskedLoad->takeName(&II);
    return IC.replaceInstUsesWith(II, MaskedLoad);
  }

  return std::nullopt;
}
1977
// Fold a predicated scatter whose index vector is a stride-1 sve.index into an
// ordinary masked store and erase the original intrinsic.
// NOTE(review): the signature line carrying this function's name is not
// visible in this rendering (presumably instCombineST1ScatterIndex) — confirm.
static std::optional<Instruction *>
  Value *Val = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *BasePtr = II.getOperand(2);
  Value *Index = II.getOperand(3);
  Type *Ty = Val->getType();

  // Contiguous scatter => masked store.
  // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
  // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
  Value *IndexBase;
  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
                       m_Value(IndexBase), m_SpecificInt(1)))) {
    // Use whatever alignment the base pointer is known to have.
    Align Alignment =
        BasePtr->getPointerAlignment(II.getDataLayout());

    Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
                                      BasePtr, IndexBase);
    // The masked store is emitted for its side effect only; the scatter
    // itself is then deleted.
    (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);

    return IC.eraseInstFromFunction(II);
  }

  return std::nullopt;
}
2004
2005static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2006 IntrinsicInst &II) {
2007 Type *Int32Ty = IC.Builder.getInt32Ty();
2008 Value *Pred = II.getOperand(0);
2009 Value *Vec = II.getOperand(1);
2010 Value *DivVec = II.getOperand(2);
2011
2012 Value *SplatValue = getSplatValue(DivVec);
2013 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2014 if (!SplatConstantInt)
2015 return std::nullopt;
2016
2017 APInt Divisor = SplatConstantInt->getValue();
2018 const int64_t DivisorValue = Divisor.getSExtValue();
2019 if (DivisorValue == -1)
2020 return std::nullopt;
2021 if (DivisorValue == 1)
2022 IC.replaceInstUsesWith(II, Vec);
2023
2024 if (Divisor.isPowerOf2()) {
2025 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2026 auto ASRD = IC.Builder.CreateIntrinsic(
2027 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2028 return IC.replaceInstUsesWith(II, ASRD);
2029 }
2030 if (Divisor.isNegatedPowerOf2()) {
2031 Divisor.negate();
2032 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2033 auto ASRD = IC.Builder.CreateIntrinsic(
2034 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2035 auto NEG = IC.Builder.CreateIntrinsic(
2036 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2037 return IC.replaceInstUsesWith(II, NEG);
2038 }
2039
2040 return std::nullopt;
2041}
2042
2043bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2044 size_t VecSize = Vec.size();
2045 if (VecSize == 1)
2046 return true;
2047 if (!isPowerOf2_64(VecSize))
2048 return false;
2049 size_t HalfVecSize = VecSize / 2;
2050
2051 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2052 RHS != Vec.end(); LHS++, RHS++) {
2053 if (*LHS != nullptr && *RHS != nullptr) {
2054 if (*LHS == *RHS)
2055 continue;
2056 else
2057 return false;
2058 }
2059 if (!AllowPoison)
2060 return false;
2061 if (*LHS == nullptr && *RHS != nullptr)
2062 *LHS = *RHS;
2063 }
2064
2065 Vec.resize(HalfVecSize);
2066 SimplifyValuePattern(Vec, AllowPoison);
2067 return true;
2068}
2069
// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
// to dupqlane(f64(C)) where C is A concatenated with B
static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
                                                           IntrinsicInst &II) {
  // Only handle the canonical form: a vector.insert of a fixed-length vector
  // that was built by a chain of insertelement instructions.
  Value *CurrentInsertElt = nullptr, *Default = nullptr;
  if (!match(II.getOperand(0),
             m_Intrinsic<Intrinsic::vector_insert>(
                 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
      !isa<FixedVectorType>(CurrentInsertElt->getType()))
    return std::nullopt;
  auto IIScalableTy = cast<ScalableVectorType>(II.getType());

  // Insert the scalars into a container ordered by InsertElement index
  SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
  while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
    auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
    Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
    CurrentInsertElt = InsertElt->getOperand(0);
  }

  // Unset lanes may be treated as wildcards only when both the base of the
  // insertelement chain and the vector.insert default are poison.
  bool AllowPoison =
      isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
  if (!SimplifyValuePattern(Elts, AllowPoison))
    return std::nullopt;

  // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
  Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
  for (size_t I = 0; I < Elts.size(); I++) {
    if (Elts[I] == nullptr)
      continue;
    InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
                                                    IC.Builder.getInt64(I));
  }
  if (InsertEltChain == nullptr)
    return std::nullopt;

  // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
  // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
  // be bitcast to a type wide enough to fit the sequence, be splatted, and then
  // be narrowed back to the original type.
  // PatternWidth: the simplified pattern's width in bits.
  // PatternElementCount: how many such patterns make up the minimum vector
  // width (total min bits / pattern bits).
  unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
  unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
                                 IIScalableTy->getMinNumElements() /
                                 PatternWidth;

  IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
  auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
  auto *WideShuffleMaskTy =
      ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);

  // Place the rebuilt pattern at element 0, view it as one wide integer per
  // pattern, broadcast lane 0 via an all-zero shuffle mask, then cast back to
  // the original element type.
  auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
  auto InsertSubvector = IC.Builder.CreateInsertVector(
      II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
  auto WideBitcast =
      IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
  auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
  auto WideShuffle = IC.Builder.CreateShuffleVector(
      WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
  auto NarrowBitcast =
      IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());

  return IC.replaceInstUsesWith(II, NarrowBitcast);
}
2133
2134static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2135 IntrinsicInst &II) {
2136 Value *A = II.getArgOperand(0);
2137 Value *B = II.getArgOperand(1);
2138 if (A == B)
2139 return IC.replaceInstUsesWith(II, A);
2140
2141 return std::nullopt;
2142}
2143
2144static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2145 IntrinsicInst &II) {
2146 Value *Pred = II.getOperand(0);
2147 Value *Vec = II.getOperand(1);
2148 Value *Shift = II.getOperand(2);
2149
2150 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2151 Value *AbsPred, *MergedValue;
2152 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2153 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2154 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2155 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2156
2157 return std::nullopt;
2158
2159 // Transform is valid if any of the following are true:
2160 // * The ABS merge value is an undef or non-negative
2161 // * The ABS predicate is all active
2162 // * The ABS predicate and the SRSHL predicates are the same
2163 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2164 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2165 return std::nullopt;
2166
2167 // Only valid when the shift amount is non-negative, otherwise the rounding
2168 // behaviour of SRSHL cannot be ignored.
2169 if (!match(Shift, m_NonNegative()))
2170 return std::nullopt;
2171
2172 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2173 {II.getType()}, {Pred, Vec, Shift});
2174
2175 return IC.replaceInstUsesWith(II, LSL);
2176}
2177
2178static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2179 IntrinsicInst &II) {
2180 Value *Vec = II.getOperand(0);
2181
2182 if (getSplatValue(Vec) == II.getOperand(1))
2183 return IC.replaceInstUsesWith(II, Vec);
2184
2185 return std::nullopt;
2186}
2187
// Erase a DMB barrier that is followed — within a bounded lookahead window of
// memory-silent, side-effect-free instructions — by an identical barrier.
static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  // If this barrier is post-dominated by identical one we can remove it
  auto *NI = II.getNextNonDebugInstruction();
  unsigned LookaheadThreshold = DMBLookaheadThreshold;
  // Instructions that may sit between the two barriers without affecting
  // memory ordering: they neither touch memory nor have other side effects.
  auto CanSkipOver = [](Instruction *I) {
    return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
  };
  while (LookaheadThreshold-- && CanSkipOver(NI)) {
    auto *NIBB = NI->getParent();
    NI = NI->getNextNonDebugInstruction();
    if (!NI) {
      // Ran off the end of the block: follow a unique successor if there is
      // one, otherwise stop scanning.
      if (auto *SuccBB = NIBB->getUniqueSuccessor())
        NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
      else
        break;
    }
  }
  // If the scan stopped on an identical barrier, this one is redundant.
  auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
  if (NextII && II.isIdenticalTo(NextII))
    return IC.eraseInstFromFunction(II);

  return std::nullopt;
}
2212
// Target-specific InstCombine entry point for AArch64 intrinsics: dispatch
// each intrinsic ID to its dedicated combine routine and return the
// replacement instruction, or std::nullopt when nothing simplifies.
// NOTE(review): the signature line carrying the qualified function name is
// not visible in this rendering (presumably
// "AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,") — confirm.
std::optional<Instruction *>
                                     IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::aarch64_dmb:
    return instCombineDMB(IC, II);
  // Predicated FP conversions: handled as a group by the no-active-lanes
  // replacement helper.
  case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
  case Intrinsic::aarch64_sve_fcvt_f16f32:
  case Intrinsic::aarch64_sve_fcvt_f16f64:
  case Intrinsic::aarch64_sve_fcvt_f32f16:
  case Intrinsic::aarch64_sve_fcvt_f32f64:
  case Intrinsic::aarch64_sve_fcvt_f64f16:
  case Intrinsic::aarch64_sve_fcvt_f64f32:
  case Intrinsic::aarch64_sve_fcvtlt_f32f16:
  case Intrinsic::aarch64_sve_fcvtlt_f64f32:
  case Intrinsic::aarch64_sve_fcvtx_f32f64:
  case Intrinsic::aarch64_sve_fcvtzs:
  case Intrinsic::aarch64_sve_fcvtzs_i32f16:
  case Intrinsic::aarch64_sve_fcvtzs_i32f64:
  case Intrinsic::aarch64_sve_fcvtzs_i64f16:
  case Intrinsic::aarch64_sve_fcvtzs_i64f32:
  case Intrinsic::aarch64_sve_fcvtzu:
  case Intrinsic::aarch64_sve_fcvtzu_i32f16:
  case Intrinsic::aarch64_sve_fcvtzu_i32f64:
  case Intrinsic::aarch64_sve_fcvtzu_i64f16:
  case Intrinsic::aarch64_sve_fcvtzu_i64f32:
  case Intrinsic::aarch64_sve_scvtf:
  case Intrinsic::aarch64_sve_scvtf_f16i32:
  case Intrinsic::aarch64_sve_scvtf_f16i64:
  case Intrinsic::aarch64_sve_scvtf_f32i64:
  case Intrinsic::aarch64_sve_scvtf_f64i32:
  case Intrinsic::aarch64_sve_ucvtf:
  case Intrinsic::aarch64_sve_ucvtf_f16i32:
  case Intrinsic::aarch64_sve_ucvtf_f16i64:
  case Intrinsic::aarch64_sve_ucvtf_f32i64:
  case Intrinsic::aarch64_sve_ucvtf_f64i32:
  // NOTE(review): one source line is not visible in this rendering at this
  // point (the conversion group above presumably returns here, before the
  // fcvtnt group) — confirm against the original file.
  case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
  case Intrinsic::aarch64_sve_fcvtnt_f16f32:
  case Intrinsic::aarch64_sve_fcvtnt_f32f64:
  case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
    return instCombineSVENoActiveReplace(IC, II, true);
  // Predicated stores/scatters: erase when no lanes are active; the integer
  // argument selects the helper's variant.
  case Intrinsic::aarch64_sve_st1_scatter:
  case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
  case Intrinsic::aarch64_sve_st1_scatter_sxtw:
  case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
  case Intrinsic::aarch64_sve_st1_scatter_uxtw:
  case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
  case Intrinsic::aarch64_sve_st1dq:
  case Intrinsic::aarch64_sve_st1q_scatter_index:
  case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
  case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
  case Intrinsic::aarch64_sve_st1wq:
  case Intrinsic::aarch64_sve_stnt1:
  case Intrinsic::aarch64_sve_stnt1_scatter:
  case Intrinsic::aarch64_sve_stnt1_scatter_index:
  case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
  case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
    return instCombineSVENoActiveUnaryErase(IC, II, 1);
  case Intrinsic::aarch64_sve_st2:
  case Intrinsic::aarch64_sve_st2q:
    return instCombineSVENoActiveUnaryErase(IC, II, 2);
  case Intrinsic::aarch64_sve_st3:
  case Intrinsic::aarch64_sve_st3q:
    return instCombineSVENoActiveUnaryErase(IC, II, 3);
  case Intrinsic::aarch64_sve_st4:
  case Intrinsic::aarch64_sve_st4q:
    return instCombineSVENoActiveUnaryErase(IC, II, 4);
  // Predicated operations whose result is all-zero when no lanes are active.
  case Intrinsic::aarch64_sve_addqv:
  case Intrinsic::aarch64_sve_and_z:
  case Intrinsic::aarch64_sve_bic_z:
  case Intrinsic::aarch64_sve_brka_z:
  case Intrinsic::aarch64_sve_brkb_z:
  case Intrinsic::aarch64_sve_brkn_z:
  case Intrinsic::aarch64_sve_brkpa_z:
  case Intrinsic::aarch64_sve_brkpb_z:
  case Intrinsic::aarch64_sve_cntp:
  case Intrinsic::aarch64_sve_compact:
  case Intrinsic::aarch64_sve_eor_z:
  case Intrinsic::aarch64_sve_eorv:
  case Intrinsic::aarch64_sve_eorqv:
  case Intrinsic::aarch64_sve_nand_z:
  case Intrinsic::aarch64_sve_nor_z:
  case Intrinsic::aarch64_sve_orn_z:
  case Intrinsic::aarch64_sve_orr_z:
  case Intrinsic::aarch64_sve_orv:
  case Intrinsic::aarch64_sve_orqv:
  case Intrinsic::aarch64_sve_pnext:
  case Intrinsic::aarch64_sve_rdffr_z:
  case Intrinsic::aarch64_sve_saddv:
  case Intrinsic::aarch64_sve_uaddv:
  case Intrinsic::aarch64_sve_umaxv:
  case Intrinsic::aarch64_sve_umaxqv:
  case Intrinsic::aarch64_sve_cmpeq:
  case Intrinsic::aarch64_sve_cmpeq_wide:
  case Intrinsic::aarch64_sve_cmpge:
  case Intrinsic::aarch64_sve_cmpge_wide:
  case Intrinsic::aarch64_sve_cmpgt:
  case Intrinsic::aarch64_sve_cmpgt_wide:
  case Intrinsic::aarch64_sve_cmphi:
  case Intrinsic::aarch64_sve_cmphi_wide:
  case Intrinsic::aarch64_sve_cmphs:
  case Intrinsic::aarch64_sve_cmphs_wide:
  case Intrinsic::aarch64_sve_cmple_wide:
  case Intrinsic::aarch64_sve_cmplo_wide:
  case Intrinsic::aarch64_sve_cmpls_wide:
  case Intrinsic::aarch64_sve_cmplt_wide:
  case Intrinsic::aarch64_sve_facge:
  case Intrinsic::aarch64_sve_facgt:
  case Intrinsic::aarch64_sve_fcmpeq:
  case Intrinsic::aarch64_sve_fcmpge:
  case Intrinsic::aarch64_sve_fcmpgt:
  case Intrinsic::aarch64_sve_fcmpne:
  case Intrinsic::aarch64_sve_fcmpuo:
  case Intrinsic::aarch64_sve_ld1_gather:
  case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ld1_gather_sxtw:
  case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
  case Intrinsic::aarch64_sve_ld1_gather_uxtw:
  case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
  case Intrinsic::aarch64_sve_ld1q_gather_index:
  case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
  case Intrinsic::aarch64_sve_ld1ro:
  case Intrinsic::aarch64_sve_ld1rq:
  case Intrinsic::aarch64_sve_ld1udq:
  case Intrinsic::aarch64_sve_ld1uwq:
  case Intrinsic::aarch64_sve_ld2_sret:
  case Intrinsic::aarch64_sve_ld2q_sret:
  case Intrinsic::aarch64_sve_ld3_sret:
  case Intrinsic::aarch64_sve_ld3q_sret:
  case Intrinsic::aarch64_sve_ld4_sret:
  case Intrinsic::aarch64_sve_ld4q_sret:
  case Intrinsic::aarch64_sve_ldff1:
  case Intrinsic::aarch64_sve_ldff1_gather:
  case Intrinsic::aarch64_sve_ldff1_gather_index:
  case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
  case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
  case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
  case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
  case Intrinsic::aarch64_sve_ldnf1:
  case Intrinsic::aarch64_sve_ldnt1:
  case Intrinsic::aarch64_sve_ldnt1_gather:
  case Intrinsic::aarch64_sve_ldnt1_gather_index:
  case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
  case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
    return instCombineSVENoActiveZero(IC, II);
  // Prefetches: erased when no lanes are active.
  case Intrinsic::aarch64_sve_prf:
  case Intrinsic::aarch64_sve_prfb_gather_index:
  case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
  case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
  case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
  case Intrinsic::aarch64_sve_prfd_gather_index:
  case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
  case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
  case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
  case Intrinsic::aarch64_sve_prfh_gather_index:
  case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
  case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
  case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
  case Intrinsic::aarch64_sve_prfw_gather_index:
  case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
  case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
  case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
    return instCombineSVENoActiveUnaryErase(IC, II, 0);
  case Intrinsic::aarch64_neon_fmaxnm:
  case Intrinsic::aarch64_neon_fminnm:
    return instCombineMaxMinNM(IC, II);
  case Intrinsic::aarch64_sve_convert_from_svbool:
    return instCombineConvertFromSVBool(IC, II);
  case Intrinsic::aarch64_sve_dup:
    return instCombineSVEDup(IC, II);
  case Intrinsic::aarch64_sve_dup_x:
    return instCombineSVEDupX(IC, II);
  case Intrinsic::aarch64_sve_cmpne:
  case Intrinsic::aarch64_sve_cmpne_wide:
    return instCombineSVECmpNE(IC, II);
  case Intrinsic::aarch64_sve_rdffr:
    return instCombineRDFFR(IC, II);
  case Intrinsic::aarch64_sve_lasta:
  case Intrinsic::aarch64_sve_lastb:
    return instCombineSVELast(IC, II);
  case Intrinsic::aarch64_sve_clasta_n:
  case Intrinsic::aarch64_sve_clastb_n:
    return instCombineSVECondLast(IC, II);
  // Element-count intrinsics: the integer argument is the number of elements
  // per 128 bits for that element width (cntd=2, cntw=4, cnth=8, cntb=16).
  case Intrinsic::aarch64_sve_cntd:
    return instCombineSVECntElts(IC, II, 2);
  case Intrinsic::aarch64_sve_cntw:
    return instCombineSVECntElts(IC, II, 4);
  case Intrinsic::aarch64_sve_cnth:
    return instCombineSVECntElts(IC, II, 8);
  case Intrinsic::aarch64_sve_cntb:
    return instCombineSVECntElts(IC, II, 16);
  case Intrinsic::aarch64_sve_ptest_any:
  case Intrinsic::aarch64_sve_ptest_first:
  case Intrinsic::aarch64_sve_ptest_last:
    return instCombineSVEPTest(IC, II);
  // Predicated binary ops: with an all-active or no-active predicate these
  // convert to their unpredicated '_u' counterparts.
  case Intrinsic::aarch64_sve_fabd:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
  case Intrinsic::aarch64_sve_fadd:
    return instCombineSVEVectorFAdd(IC, II);
  case Intrinsic::aarch64_sve_fadd_u:
    return instCombineSVEVectorFAddU(IC, II);
  case Intrinsic::aarch64_sve_fdiv:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
  case Intrinsic::aarch64_sve_fmax:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
  case Intrinsic::aarch64_sve_fmaxnm:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
  case Intrinsic::aarch64_sve_fmin:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
  case Intrinsic::aarch64_sve_fminnm:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
  case Intrinsic::aarch64_sve_fmla:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
  case Intrinsic::aarch64_sve_fmls:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
  case Intrinsic::aarch64_sve_fmul:
    if (auto II_U =
            instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
      return II_U;
    return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
  case Intrinsic::aarch64_sve_fmul_u:
    return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
  case Intrinsic::aarch64_sve_fmulx:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
  case Intrinsic::aarch64_sve_fnmla:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
  case Intrinsic::aarch64_sve_fnmls:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
  case Intrinsic::aarch64_sve_fsub:
    return instCombineSVEVectorFSub(IC, II);
  case Intrinsic::aarch64_sve_fsub_u:
    return instCombineSVEVectorFSubU(IC, II);
  case Intrinsic::aarch64_sve_add:
    return instCombineSVEVectorAdd(IC, II);
  case Intrinsic::aarch64_sve_add_u:
    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                             Intrinsic::aarch64_sve_mla_u>(
        IC, II, true);
  case Intrinsic::aarch64_sve_mla:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
  case Intrinsic::aarch64_sve_mls:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
  case Intrinsic::aarch64_sve_mul:
    if (auto II_U =
            instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
      return II_U;
    return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
  case Intrinsic::aarch64_sve_mul_u:
    return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
  case Intrinsic::aarch64_sve_sabd:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
  case Intrinsic::aarch64_sve_smax:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
  case Intrinsic::aarch64_sve_smin:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
  case Intrinsic::aarch64_sve_smulh:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
  case Intrinsic::aarch64_sve_sub:
    return instCombineSVEVectorSub(IC, II);
  case Intrinsic::aarch64_sve_sub_u:
    return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                             Intrinsic::aarch64_sve_mls_u>(
        IC, II, true);
  case Intrinsic::aarch64_sve_uabd:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
  case Intrinsic::aarch64_sve_umax:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
  case Intrinsic::aarch64_sve_umin:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
  case Intrinsic::aarch64_sve_umulh:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
  case Intrinsic::aarch64_sve_asr:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
  case Intrinsic::aarch64_sve_lsl:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
  case Intrinsic::aarch64_sve_lsr:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
  case Intrinsic::aarch64_sve_and:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
  case Intrinsic::aarch64_sve_bic:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
  case Intrinsic::aarch64_sve_eor:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
  case Intrinsic::aarch64_sve_orr:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
  case Intrinsic::aarch64_sve_sqsub:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
  case Intrinsic::aarch64_sve_uqsub:
    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
  case Intrinsic::aarch64_sve_tbl:
    return instCombineSVETBL(IC, II);
  case Intrinsic::aarch64_sve_uunpkhi:
  case Intrinsic::aarch64_sve_uunpklo:
  case Intrinsic::aarch64_sve_sunpkhi:
  case Intrinsic::aarch64_sve_sunpklo:
    return instCombineSVEUnpack(IC, II);
  case Intrinsic::aarch64_sve_uzp1:
    return instCombineSVEUzp1(IC, II);
  case Intrinsic::aarch64_sve_zip1:
  case Intrinsic::aarch64_sve_zip2:
    return instCombineSVEZip(IC, II);
  case Intrinsic::aarch64_sve_ld1_gather_index:
    return instCombineLD1GatherIndex(IC, II);
  case Intrinsic::aarch64_sve_st1_scatter_index:
    return instCombineST1ScatterIndex(IC, II);
  case Intrinsic::aarch64_sve_ld1:
    return instCombineSVELD1(IC, II, DL);
  case Intrinsic::aarch64_sve_st1:
    return instCombineSVEST1(IC, II, DL);
  case Intrinsic::aarch64_sve_sdiv:
    return instCombineSVESDIV(IC, II);
  case Intrinsic::aarch64_sve_sel:
    return instCombineSVESel(IC, II);
  case Intrinsic::aarch64_sve_srshl:
    return instCombineSVESrshl(IC, II);
  case Intrinsic::aarch64_sve_dupq_lane:
    return instCombineSVEDupqLane(IC, II);
  case Intrinsic::aarch64_sve_insr:
    return instCombineSVEInsr(IC, II);
  }

  return std::nullopt;
}
2542
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  // NOTE(review): the first line of this signature (return type and the
  // qualified function name) is not visible in this rendering; from the
  // parameter shape this is the demanded-vector-elements simplification hook
  // — confirm against the original source.
  switch (II.getIntrinsicID()) {
  default:
    break;
  // For these NEON narrowing intrinsics, propagate the demanded-elements
  // mask straight through to operand 0.
  case Intrinsic::aarch64_neon_fcvtxn:
  case Intrinsic::aarch64_neon_rshrn:
  case Intrinsic::aarch64_neon_sqrshrn:
  case Intrinsic::aarch64_neon_sqrshrun:
  case Intrinsic::aarch64_neon_sqshrn:
  case Intrinsic::aarch64_neon_sqshrun:
  case Intrinsic::aarch64_neon_sqxtn:
  case Intrinsic::aarch64_neon_sqxtun:
  case Intrinsic::aarch64_neon_uqrshrn:
  case Intrinsic::aarch64_neon_uqshrn:
  case Intrinsic::aarch64_neon_uqxtn:
    SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
    break;
  }

  // No replacement instruction is produced here; only operands may have been
  // simplified above.
  return std::nullopt;
}
2568
  // Scalable vectorization is available with full SVE, or with
  // SVE/streaming-SVE under an additional condition.
  // NOTE(review): the signature line and the continuation of this boolean
  // expression are not visible in this rendering (two source lines were
  // dropped) — confirm the second operand against the original file.
  return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
}
2573
2576 switch (K) {
2578 return TypeSize::getFixed(64);
2580 if (ST->useSVEForFixedLengthVectors() &&
2582 return TypeSize::getFixed(
2583 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2584 else if (ST->isNeonAvailable())
2585 return TypeSize::getFixed(128);
2586 else
2587 return TypeSize::getFixed(0);
2589 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2591 return TypeSize::getScalable(128);
2592 else
2593 return TypeSize::getScalable(0);
2594 }
2595 llvm_unreachable("Unsupported register kind");
2596}
2597
/// Returns true when an \p Opcode operation producing \p DstTy from the two
/// operands in Args maps onto a single NEON widening instruction (e.g.
/// uaddl/usubw/smull), which makes the feeding sext/zext effectively free.
/// \p SrcOverrideTy, when non-null, overrides the extend source type that
/// would otherwise be deduced from the operands.
/// NOTE(review): the Args parameter line appears to have been dropped from
/// this extract of the signature.
2598bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2600 Type *SrcOverrideTy) {
2601 // A helper that returns a vector type from the given type. The number of
2602 // elements in type Ty determines the vector width.
2603 auto toVectorTy = [&](Type *ArgTy) {
2604 return VectorType::get(ArgTy->getScalarType(),
2605 cast<VectorType>(DstTy)->getElementCount());
2606 };
2607
2608 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2609 // i32, i64]. SVE doesn't generally have the same set of instructions to
2610 // perform an extend with the add/sub/mul. There are SMULLB style
2611 // instructions, but they operate on top/bottom, requiring some sort of lane
2612 // interleaving to be used with zext/sext.
2613 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2614 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2615 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2616 return false;
2617
2618 // Determine if the operation has a widening variant. We consider both the
2619 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2620 // instructions.
2621 //
2622 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2623 // verify that their extending operands are eliminated during code
2624 // generation.
2625 Type *SrcTy = SrcOverrideTy;
2626 switch (Opcode) {
2627 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2628 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2629 // The second operand needs to be an extend
2630 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2631 if (!SrcTy)
2632 SrcTy =
2633 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2634 } else
2635 return false;
2636 break;
2637 case Instruction::Mul: { // SMULL(2), UMULL(2)
2638 // Both operands need to be extends of the same type.
2639 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2640 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2641 if (!SrcTy)
2642 SrcTy =
2643 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2644 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2645 // If one of the operands is a Zext and the other has enough zero bits to
2646 // be treated as unsigned, we can still general a umull, meaning the zext
2647 // is free.
2648 KnownBits Known =
2649 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2650 if (Args[0]->getType()->getScalarSizeInBits() -
2651 Known.Zero.countLeadingOnes() >
2652 DstTy->getScalarSizeInBits() / 2)
2653 return false;
2654 if (!SrcTy)
2655 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2656 DstTy->getScalarSizeInBits() / 2));
2657 } else
2658 return false;
2659 break;
2660 }
2661 default:
2662 return false;
2663 }
2664
2665 // Legalize the destination type and ensure it can be used in a widening
2666 // operation.
2667 auto DstTyL = getTypeLegalizationCost(DstTy);
2668 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2669 return false;
2670
2671 // Legalize the source type and ensure it can be used in a widening
2672 // operation.
2673 assert(SrcTy && "Expected some SrcTy");
2674 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2675 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2676 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2677 return false;
2678
2679 // Get the total number of vector elements in the legalized types.
2680 InstructionCost NumDstEls =
2681 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2682 InstructionCost NumSrcEls =
2683 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2684
2685 // Return true if the legalized types have the same number of vector elements
2686 // and the destination element type size is twice that of the source type.
2687 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2688}
2689
2690// s/urhadd instructions implement the following pattern, making the
2691// extends free:
2692// %x = add ((zext i8 -> i16), 1)
2693// %y = (zext i8 -> i16)
2694// trunc i16 (lshr (add %x, %y), 1) -> i8
2695//
2697 Type *Src) {
 // The extend is free when it feeds an add chain that folds into a NEON /
 // SVE2 s/urhadd (rounding halving add): add(ext, add(ext, 1)) followed by
 // lshr 1 and a trunc back to the source element width (see the pattern
 // description above).
2698 // The source should be a legal vector type.
2699 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2700 (Src->isScalableTy() && !ST->hasSVE2()))
2701 return false;
2702
2703 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2704 return false;
2705
2706 // Look for trunc/shl/add before trying to match the pattern.
2707 const Instruction *Add = ExtUser;
2708 auto *AddUser =
2709 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2710 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2711 Add = AddUser;
2712
2713 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2714 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2715 return false;
2716
2717 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2718 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2719 Src->getScalarSizeInBits() !=
2720 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2721 return false;
2722
2723 // Try to match the whole pattern. Ext could be either the first or second
2724 // m_ZExtOrSExt matched.
2725 Instruction *Ex1, *Ex2;
2726 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2727 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2728 return false;
2729
2730 // Ensure both extends are of the same type
2731 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2732 Ex1->getOpcode() == Ex2->getOpcode())
2733 return true;
2734
2735 return false;
2736}
2737
2739 Type *Src,
2742 const Instruction *I) {
 // Cost of a cast from Src to Dst. Free-cast patterns are checked first
 // (casts consumed by a widening instruction or an avg/rhadd pattern), then
 // per-subtarget conversion cost tables, before deferring to the base
 // implementation.
2743 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2744 assert(ISD && "Invalid opcode");
2745 // If the cast is observable, and it is used by a widening instruction (e.g.,
2746 // uaddl, saddw, etc.), it may be free.
2747 if (I && I->hasOneUser()) {
2748 auto *SingleUser = cast<Instruction>(*I->user_begin());
2749 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2750 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2751 // For adds only count the second operand as free if both operands are
2752 // extends but not the same operation. (i.e both operands are not free in
2753 // add(sext, zext)).
2754 if (SingleUser->getOpcode() == Instruction::Add) {
2755 if (I == SingleUser->getOperand(1) ||
2756 (isa<CastInst>(SingleUser->getOperand(1)) &&
2757 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2758 return 0;
2759 } else // Others are free so long as isWideningInstruction returned true.
2760 return 0;
2761 }
2762
2763 // The cast will be free for the s/urhadd instructions
2764 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2765 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2766 return 0;
2767 }
2768
2769 // TODO: Allow non-throughput costs that aren't binary.
2770 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2772 return Cost == 0 ? 0 : 1;
2773 return Cost;
2774 };
2775
2776 EVT SrcTy = TLI->getValueType(DL, Src);
2777 EVT DstTy = TLI->getValueType(DL, Dst);
2778
2779 if (!SrcTy.isSimple() || !DstTy.isSimple())
2780 return AdjustCost(
2781 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2782
 // Costs for conversions that are only available with the +bf16 feature
 // (guarded by ST->hasBF16() below); costs are in instruction counts.
2783 static const TypeConversionCostTblEntry BF16Tbl[] = {
2784 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
2785 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
2786 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
2787 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
2788 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
2789 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
2790 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
2791 };
2792
2793 if (ST->hasBF16())
2794 if (const auto *Entry = ConvertCostTableLookup(
2795 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2796 return AdjustCost(Entry->Cost);
2797
2798 static const TypeConversionCostTblEntry ConversionTbl[] = {
2799 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2800 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2801 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2802 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2803 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2804 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2805 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2806 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2807 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2808 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2809 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2810 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2811 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2812 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2813 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2814 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2815 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2816 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2817 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2818 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2819
2820 // Truncations on nxvmiN
2821 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
2822 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
2823 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
2824 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
2825 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
2826 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
2827 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
2828 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
2829 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
2830 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
2831 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
2832 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
2833 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
2834 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
2835 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
2836 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
2837 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
2838 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
2839 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
2840 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
2841 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
2842 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
2843 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
2844 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
2845 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
2846 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
2847 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
2848 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
2849 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
2850 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
2851 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
2852 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
2853 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
2854
2855 // The number of shll instructions for the extension.
2856 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2857 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2858 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2859 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2860 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2861 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2862 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2863 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2864 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2865 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2866 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2867 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2868 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2869 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2870 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2871 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2872
2873 // FP Ext and trunc
2874 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
2875 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
2876 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
2877 // FP16
2878 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
2879 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
2880 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
2881 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
2882 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
2883 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
2884 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
2885 // BF16 (uses shift)
2886 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
2887 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
2888 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
2889 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
2890 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
2891 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
2892 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
2893 // FP Ext and trunc
2894 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
2895 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
2896 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
2897 // FP16
2898 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
2899 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
2900 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
2901 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
2902 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
2903 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
2904 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
2905 // BF16 (more complex, with +bf16 is handled above)
2906 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
2907 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
2908 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
2909 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
2910 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
2911 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
2912 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
2913 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
2914
2915 // LowerVectorINT_TO_FP:
2916 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2917 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2918 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2919 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2920 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2921 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2922
2923 // Complex: to v2f32
2924 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2925 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2926 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2927 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2928 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2929 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2930
2931 // Complex: to v4f32
2932 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
2933 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2934 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
2935 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2936
2937 // Complex: to v8f32
2938 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2939 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2940 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2941 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2942
2943 // Complex: to v16f32
2944 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2945 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2946
2947 // Complex: to v2f64
2948 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2949 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2950 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2951 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2952 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2953 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2954
2955 // Complex: to v4f64
2956 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2957 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2958
2959 // LowerVectorFP_TO_INT
2960 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
2961 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
2962 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
2963 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
2964 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
2965 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
2966
2967 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2968 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
2969 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
2970 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
2971 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
2972 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
2973 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
2974
2975 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2976 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
2977 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
2978 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
2979 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
2980
2981 // Complex, from nxv2f32.
2982 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2983 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2984 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2985 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2986 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2987 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2988 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2989 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2990
2991 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2992 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
2993 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
2994 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
2995 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
2996 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
2997 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
2998
2999 // Complex, from nxv2f64.
3000 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3001 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3002 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3003 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3004 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3005 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3006 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3007 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3008
3009 // Complex, from nxv4f32.
3010 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3011 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3012 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3013 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3014 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3015 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3016 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3017 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3018
3019 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3020 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3021 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3022 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3023 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3024
3025 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3026 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3027 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3028 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3029 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3030 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3031 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3032
3033 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3034 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3035 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3036 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3037 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3038
3039 // Complex, from nxv8f16.
3040 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3041 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3042 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3043 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3044 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3045 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3046 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3047 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3048
3049 // Complex, from nxv4f16.
3050 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3051 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3052 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3053 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3054 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3055 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3056 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3057 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3058
3059 // Complex, from nxv2f16.
3060 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3061 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3062 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3063 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3064 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3065 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3066 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3067 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3068
3069 // Truncate from nxvmf32 to nxvmf16.
3070 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3071 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3072 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3073
3074 // Truncate from nxvmf64 to nxvmf16.
3075 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3076 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3077 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3078
3079 // Truncate from nxvmf64 to nxvmf32.
3080 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3081 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3082 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3083
3084 // Extend from nxvmf16 to nxvmf32.
3085 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3086 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3087 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3088
3089 // Extend from nxvmf16 to nxvmf64.
3090 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3091 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3092 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3093
3094 // Extend from nxvmf32 to nxvmf64.
3095 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3096 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3097 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3098
3099 // Bitcasts from float to integer
3100 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3101 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3102 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3103
3104 // Bitcasts from integer to float
3105 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3106 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3107 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3108
3109 // Add cost for extending to illegal -too wide- scalable vectors.
3110 // zero/sign extend are implemented by multiple unpack operations,
3111 // where each operation has a cost of 1.
3112 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3113 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3114 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3115 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3116 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3117 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3118
3119 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3120 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3121 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3122 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3123 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3124 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3125 };
3126
3127 // We have to estimate a cost of fixed length operation upon
3128 // SVE registers(operations) with the number of registers required
3129 // for a fixed type to be represented upon SVE registers.
3130 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3131 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3132 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3133 ST->useSVEForFixedLengthVectors(WiderTy)) {
3134 std::pair<InstructionCost, MVT> LT =
3135 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3136 unsigned NumElements =
3137 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3138 return AdjustCost(
3139 LT.first *
3141 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3142 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3143 CostKind, I));
3144 }
3145
3146 if (const auto *Entry = ConvertCostTableLookup(
3147 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3148 return AdjustCost(Entry->Cost);
3149
 // Costs that apply only when full FP16 support is available (guarded by
 // ST->hasFullFP16() below).
3150 static const TypeConversionCostTblEntry FP16Tbl[] = {
3151 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3152 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3153 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3154 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3155 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3156 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3157 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3158 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3159 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3160 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3161 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3162 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3163 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3164 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3165 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3166 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3167 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3168 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3169 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3170 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3171 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3172 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3173 };
3174
3175 if (ST->hasFullFP16())
3176 if (const auto *Entry = ConvertCostTableLookup(
3177 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3178 return AdjustCost(Entry->Cost);
3179
3180 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3183 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3185 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3187 // The standard behaviour in the backend for these cases is to split the
3188 // extend up into two parts:
3189 // 1. Perform an extending load or masked load up to the legal type.
3190 // 2. Extend the loaded data to the final type.
3191 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3192 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3194 Opcode, LegalTy, Src, CCH, CostKind, I);
3196 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3197 return Part1 + Part2;
3198 }
3199
3200 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3201 // but we also want to include the TTI::CastContextHint::Masked case too.
3202 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3204 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3206
3207 return AdjustCost(
3208 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3209}
3210
3212 Type *Dst,
3213 VectorType *VecTy,
3214 unsigned Index) {
 // Cost of zero/sign-extending the element extracted from VecTy at Index:
 // the cost of the extract itself plus, unless the mov that performs the
 // extract can also perform the extension, the cost of a separate extend.
3215
3216 // Make sure we were given a valid extend opcode.
3217 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3218 "Invalid opcode");
3219
3220 // We are extending an element we extract from a vector, so the source type
3221 // of the extend is the element type of the vector.
3222 auto *Src = VecTy->getElementType();
3223
3224 // Sign- and zero-extends are for integer types only.
3225 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3226
3227 // Get the cost for the extract. We compute the cost (if any) for the extend
3228 // below.
3230 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3231 CostKind, Index, nullptr, nullptr);
3232
3233 // Legalize the types.
3234 auto VecLT = getTypeLegalizationCost(VecTy);
3235 auto DstVT = TLI->getValueType(DL, Dst);
3236 auto SrcVT = TLI->getValueType(DL, Src);
3237
3238 // If the resulting type is still a vector and the destination type is legal,
3239 // we may get the extension for free. If not, get the default cost for the
3240 // extend.
3241 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3242 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3243 CostKind);
3244
3245 // The destination type should be larger than the element type. If not, get
3246 // the default cost for the extend.
3247 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3248 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3249 CostKind);
3250
3251 switch (Opcode) {
3252 default:
3253 llvm_unreachable("Opcode should be either SExt or ZExt");
3254
3255 // For sign-extends, we only need a smov, which performs the extension
3256 // automatically.
3257 case Instruction::SExt:
3258 return Cost;
3259
3260 // For zero-extends, the extend is performed automatically by a umov unless
3261 // the destination type is i64 and the element type is i8 or i16.
3262 case Instruction::ZExt:
3263 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3264 return Cost;
3265 }
3266
3267 // If we are unable to perform the extend for free, get the default cost.
3268 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3269 CostKind);
3271
3274 const Instruction *I) {
 // Control-flow instruction cost: PHIs are free; branches are assumed to be
 // predicted and are also modelled as free for throughput.
 // NOTE(review): the first return below is normally guarded by a
 // `CostKind != TTI::TCK_RecipThroughput` check (cf. the assert that
 // follows); that guard line appears to have been dropped from this extract.
3276 return Opcode == Instruction::PHI ? 0 : 1;
3277 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3278 // Branches are assumed to be predicted.
3279 return 0;
3280}
3281
3282InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3283 unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
3284 const Instruction *I, Value *Scalar,
3285 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3286 assert(Val->isVectorTy() && "This must be a vector type");
3287
3288 if (Index != -1U) {
3289 // Legalize the type.
3290 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3291
3292 // This type is legalized to a scalar type.
3293 if (!LT.second.isVector())
3294 return 0;
3295
3296 // The type may be split. For fixed-width vectors we can normalize the
3297 // index to the new type.
3298 if (LT.second.isFixedLengthVector()) {
3299 unsigned Width = LT.second.getVectorNumElements();
3300 Index = Index % Width;
3301 }
3302
3303 // The element at index zero is already inside the vector.
3304 // - For a physical (HasRealUse==true) insert-element or extract-element
3305 // instruction that extracts integers, an explicit FPR -> GPR move is
3306 // needed. So it has non-zero cost.
3307 // - For the rest of cases (virtual instruction or element type is float),
3308 // consider the instruction free.
3309 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3310 return 0;
3311
3312 // This is recognising a LD1 single-element structure to one lane of one
3313 // register instruction. I.e., if this is an `insertelement` instruction,
3314 // and its second operand is a load, then we will generate a LD1, which
3315 // are expensive instructions.
3316 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3317 return ST->getVectorInsertExtractBaseCost() + 1;
3318
3319 // i1 inserts and extract will include an extra cset or cmp of the vector
3320 // value. Increase the cost by 1 to account.
3321 if (Val->getScalarSizeInBits() == 1)
3322 return ST->getVectorInsertExtractBaseCost() + 1;
3323
3324 // FIXME:
3325 // If the extract-element and insert-element instructions could be
3326 // simplified away (e.g., could be combined into users by looking at use-def
3327 // context), they have no cost. This is not done in the first place for
3328 // compile-time considerations.
3329 }
3330
3331 // In case of Neon, if there exists extractelement from lane != 0 such that
3332 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3333 // 2. extractelement result feeds into fmul.
3334 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3335 // equivalent to 0.
3336 // then the extractelement can be merged with fmul in the backend and it
3337 // incurs no cost.
3338 // e.g.
3339 // define double @foo(<2 x double> %a) {
3340 // %1 = extractelement <2 x double> %a, i32 0
3341 // %2 = extractelement <2 x double> %a, i32 1
3342 // %res = fmul double %1, %2
3343 // ret double %res
3344 // }
3345 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3346 auto ExtractCanFuseWithFmul = [&]() {
3347 // We bail out if the extract is from lane 0.
3348 if (Index == 0)
3349 return false;
3350
3351 // Check if the scalar element type of the vector operand of ExtractElement
3352 // instruction is one of the allowed types.
3353 auto IsAllowedScalarTy = [&](const Type *T) {
3354 return T->isFloatTy() || T->isDoubleTy() ||
3355 (T->isHalfTy() && ST->hasFullFP16());
3356 };
3357
3358 // Check if the extractelement user is scalar fmul.
3359 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3360 // Check if the user is scalar fmul.
3361 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
3362 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3363 !BO->getType()->isVectorTy();
3364 };
3365
3366 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3367 // certain scalar type and a certain vector register width.
3368 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3369 auto RegWidth =
3371 .getFixedValue();
3372 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3373 };
3374
3375 // Check if the type constraints on input vector type and result scalar type
3376 // of extractelement instruction are satisfied.
3377 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3378 return false;
3379
3380 if (Scalar) {
3381 DenseMap<User *, unsigned> UserToExtractIdx;
3382 for (auto *U : Scalar->users()) {
3383 if (!IsUserFMulScalarTy(U))
3384 return false;
3385 // Recording entry for the user is important. Index value is not
3386 // important.
3387 UserToExtractIdx[U];
3388 }
3389 if (UserToExtractIdx.empty())
3390 return false;
3391 for (auto &[S, U, L] : ScalarUserAndIdx) {
3392 for (auto *U : S->users()) {
3393 if (UserToExtractIdx.find(U) != UserToExtractIdx.end()) {
3394 auto *FMul = cast<BinaryOperator>(U);
3395 auto *Op0 = FMul->getOperand(0);
3396 auto *Op1 = FMul->getOperand(1);
3397 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3398 UserToExtractIdx[U] = L;
3399 break;
3400 }
3401 }
3402 }
3403 }
3404 for (auto &[U, L] : UserToExtractIdx) {
3405 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3406 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3407 return false;
3408 }
3409 } else {
3410 const auto *EE = cast<ExtractElementInst>(I);
3411
3412 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3413 if (!IdxOp)
3414 return false;
3415
3416 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3417 if (!IsUserFMulScalarTy(U))
3418 return false;
3419
3420 // Check if the other operand of extractelement is also extractelement
3421 // from lane equivalent to 0.
3422 const auto *BO = cast<BinaryOperator>(U);
3423 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3424 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3425 if (OtherEE) {
3426 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3427 if (!IdxOp)
3428 return false;
3429 return IsExtractLaneEquivalentToZero(
3430 cast<ConstantInt>(OtherEE->getIndexOperand())
3431 ->getValue()
3432 .getZExtValue(),
3433 OtherEE->getType()->getScalarSizeInBits());
3434 }
3435 return true;
3436 });
3437 }
3438 return true;
3439 };
3440
3441 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3442 ExtractCanFuseWithFmul())
3443 return 0;
3444
3445 // All other insert/extracts cost this much.
3446 return ST->getVectorInsertExtractBaseCost();
3447}
3448
3451 unsigned Index, Value *Op0,
3452 Value *Op1) {
3453 bool HasRealUse =
3454 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3455 return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse);
3456}
3457
3459 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3460 Value *Scalar,
3461 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3462 return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar,
3463 ScalarUserAndIdx);
3464}
3465
3467 Type *Val,
3469 unsigned Index) {
3470 return getVectorInstrCostHelper(I.getOpcode(), Val, Index,
3471 true /* HasRealUse */, &I);
3472}
3473
3475 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3477 if (isa<ScalableVectorType>(Ty))
3479 if (Ty->getElementType()->isFloatingPointTy())
3480 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3481 CostKind);
3482 return DemandedElts.popcount() * (Insert + Extract) *
3484}
3485
3487 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3490 const Instruction *CxtI) {
3491
3492 // The code-generator is currently not able to handle scalable vectors
3493 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3494 // it. This change will be removed when code-generation for these types is
3495 // sufficiently reliable.
3496 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3497 if (VTy->getElementCount() == ElementCount::getScalable(1))
3499
3500 // TODO: Handle more cost kinds.
3502 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3503 Op2Info, Args, CxtI);
3504
3505 // Legalize the type.
3506 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3507 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3508
3509 switch (ISD) {
3510 default:
3511 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3512 Op2Info);
3513 case ISD::SDIV:
3514 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3515 // On AArch64, scalar signed division by constants power-of-two are
3516 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
3517 // The OperandValue properties many not be same as that of previous
3518 // operation; conservatively assume OP_None.
3520 Instruction::Add, Ty, CostKind,
3521 Op1Info.getNoProps(), Op2Info.getNoProps());
3522 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3523 Op1Info.getNoProps(), Op2Info.getNoProps());
3525 Instruction::Select, Ty, CostKind,
3526 Op1Info.getNoProps(), Op2Info.getNoProps());
3527 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3528 Op1Info.getNoProps(), Op2Info.getNoProps());
3529 return Cost;
3530 }
3531 [[fallthrough]];
3532 case ISD::UDIV: {
3533 auto VT = TLI->getValueType(DL, Ty);
3534 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3535 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
3536 // Vector signed division by constant are expanded to the
3537 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
3538 // to MULHS + SUB + SRL + ADD + SRL.
3540 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3542 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3544 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3545 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
3546 }
3547 }
3548
3549 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
3550 // emitted by the backend even when those functions are not declared in the
3551 // module.
3552 if (!VT.isVector() && VT.getSizeInBits() > 64)
3553 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3554
3556 Opcode, Ty, CostKind, Op1Info, Op2Info);
3557 if (Ty->isVectorTy()) {
3558 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3559 // SDIV/UDIV operations are lowered using SVE, then we can have less
3560 // costs.
3561 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
3562 ->getPrimitiveSizeInBits()
3563 .getFixedValue() < 128) {
3564 EVT VT = TLI->getValueType(DL, Ty);
3565 static const CostTblEntry DivTbl[]{
3566 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
3567 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
3568 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
3569 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
3570 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
3571 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
3572
3573 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
3574 if (nullptr != Entry)
3575 return Entry->Cost;
3576 }
3577 // For 8/16-bit elements, the cost is higher because the type
3578 // requires promotion and possibly splitting:
3579 if (LT.second.getScalarType() == MVT::i8)
3580 Cost *= 8;
3581 else if (LT.second.getScalarType() == MVT::i16)
3582 Cost *= 4;
3583 return Cost;
3584 } else {
3585 // If one of the operands is a uniform constant then the cost for each
3586 // element is Cost for insertion, extraction and division.
3587 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
3588 // operation with scalar type
3589 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3590 (Op2Info.isConstant() && Op2Info.isUniform())) {
3591 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3593 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3594 return (4 + DivCost) * VTy->getNumElements();
3595 }
3596 }
3597 // On AArch64, without SVE, vector divisions are expanded
3598 // into scalar divisions of each pair of elements.
3599 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3600 CostKind, Op1Info, Op2Info);
3601 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3602 Op1Info, Op2Info);
3603 }
3604
3605 // TODO: if one of the arguments is scalar, then it's not necessary to
3606 // double the cost of handling the vector elements.
3607 Cost += Cost;
3608 }
3609 return Cost;
3610 }
3611 case ISD::MUL:
3612 // When SVE is available, then we can lower the v2i64 operation using
3613 // the SVE mul instruction, which has a lower cost.
3614 if (LT.second == MVT::v2i64 && ST->hasSVE())
3615 return LT.first;
3616
3617 // When SVE is not available, there is no MUL.2d instruction,
3618 // which means mul <2 x i64> is expensive as elements are extracted
3619 // from the vectors and the muls scalarized.
3620 // As getScalarizationOverhead is a bit too pessimistic, we
3621 // estimate the cost for a i64 vector directly here, which is:
3622 // - four 2-cost i64 extracts,
3623 // - two 2-cost i64 inserts, and
3624 // - two 1-cost muls.
3625 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
3626 // LT.first = 2 the cost is 28. If both operands are extensions it will not
3627 // need to scalarize so the cost can be cheaper (smull or umull).
3628 // so the cost can be cheaper (smull or umull).
3629 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3630 return LT.first;
3631 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
3633 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
3634 nullptr, nullptr) *
3635 2 +
3636 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
3637 nullptr, nullptr));
3638 case ISD::ADD:
3639 case ISD::XOR:
3640 case ISD::OR:
3641 case ISD::AND:
3642 case ISD::SRL:
3643 case ISD::SRA:
3644 case ISD::SHL:
3645 // These nodes are marked as 'custom' for combining purposes only.
3646 // We know that they are legal. See LowerAdd in ISelLowering.
3647 return LT.first;
3648
3649 case ISD::FNEG:
3650 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
3651 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
3652 (Ty->isHalfTy() && ST->hasFullFP16())) &&
3653 CxtI &&
3654 ((CxtI->hasOneUse() &&
3655 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
3656 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
3657 return 0;
3658 [[fallthrough]];
3659 case ISD::FADD:
3660 case ISD::FSUB:
3661 // Increase the cost for half and bfloat types if not architecturally
3662 // supported.
3663 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3664 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3665 return 2 * LT.first;
3666 if (!Ty->getScalarType()->isFP128Ty())
3667 return LT.first;
3668 [[fallthrough]];
3669 case ISD::FMUL:
3670 case ISD::FDIV:
3671 // These nodes are marked as 'custom' just to lower them to SVE.
3672 // We know said lowering will incur no additional cost.
3673 if (!Ty->getScalarType()->isFP128Ty())
3674 return 2 * LT.first;
3675
3676 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3677 Op2Info);
3678 case ISD::FREM:
3679 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3680 // those functions are not declared in the module.
3681 if (!Ty->isVectorTy())
3682 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3683 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3684 Op2Info);
3685 }
3686}
3687
3689 ScalarEvolution *SE,
3690 const SCEV *Ptr) {
3691 // Address computations in vectorized code with non-consecutive addresses will
3692 // likely result in more instructions compared to scalar code where the
3693 // computation can more often be merged into the index mode. The resulting
3694 // extra micro-ops can significantly decrease throughput.
3695 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3696 int MaxMergeDistance = 64;
3697
3698 if (Ty->isVectorTy() && SE &&
3699 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3700 return NumVectorInstToHideOverhead;
3701
3702 // In many cases the address computation is not merged into the instruction
3703 // addressing mode.
3704 return 1;
3705}
3706
3708 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3710 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3711 // TODO: Handle other cost kinds.
3713 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3714 Op1Info, Op2Info, I);
3715
3716 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3717 // We don't lower some vector selects well that are wider than the register
3718 // width.
3719 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3720 // We would need this many instructions to hide the scalarization happening.
3721 const int AmortizationCost = 20;
3722
3723 // If VecPred is not set, check if we can get a predicate from the context
3724 // instruction, if its type matches the requested ValTy.
3725 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3726 CmpPredicate CurrentPred;
3727 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3728 m_Value())))
3729 VecPred = CurrentPred;
3730 }
3731 // Check if we have a compare/select chain that can be lowered using
3732 // a (F)CMxx & BFI pair.
3733 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3734 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3735 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3736 VecPred == CmpInst::FCMP_UNE) {
3737 static const auto ValidMinMaxTys = {
3738 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3739 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3740 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3741
3742 auto LT = getTypeLegalizationCost(ValTy);
3743 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3744 (ST->hasFullFP16() &&
3745 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3746 return LT.first;
3747 }
3748
3749 static const TypeConversionCostTblEntry
3750 VectorSelectTbl[] = {
3751 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3752 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3753 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3754 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3755 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3756 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3757 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3758 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3759 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3760 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3761 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3762 };
3763
3764 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3765 EVT SelValTy = TLI->getValueType(DL, ValTy);
3766 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3767 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3768 SelCondTy.getSimpleVT(),
3769 SelValTy.getSimpleVT()))
3770 return Entry->Cost;
3771 }
3772 }
3773
3774 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3775 auto LT = getTypeLegalizationCost(ValTy);
3776 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3777 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3778 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3779 }
3780
3781 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3782 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3783 // be profitable.
3784 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3785 ICmpInst::isEquality(VecPred) &&
3786 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3787 match(I->getOperand(1), m_Zero()) &&
3788 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3789 return 0;
3790
3791 // The base case handles scalable vectors fine for now, since it treats the
3792 // cost as 1 * legalization cost.
3793 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3794 Op1Info, Op2Info, I);
3795}
3796
3798AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3800 if (ST->requiresStrictAlign()) {
3801 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3802 // a bunch of instructions when strict align is enabled.
3803 return Options;
3804 }
3805 Options.AllowOverlappingLoads = true;
3806 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3807 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3808 // TODO: Though vector loads usually perform well on AArch64, in some targets
3809 // they may wake up the FP unit, which raises the power consumption. Perhaps
3810 // they could be used with no holds barred (-O3).
3811 Options.LoadSizes = {8, 4, 2, 1};
3812 Options.AllowedTailExpansions = {3, 5, 6};
3813 return Options;
3814}
3815
3817 return ST->hasSVE();
3818}
3819
3822 Align Alignment, unsigned AddressSpace,
3824 if (useNeonVector(Src))
3825 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3826 CostKind);
3827 auto LT = getTypeLegalizationCost(Src);
3828 if (!LT.first.isValid())
3830
3831 // Return an invalid cost for element types that we are unable to lower.
3832 auto *VT = cast<VectorType>(Src);
3833 if (VT->getElementType()->isIntegerTy(1))
3835
3836 // The code-generator is currently not able to handle scalable vectors
3837 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3838 // it. This change will be removed when code-generation for these types is
3839 // sufficiently reliable.
3840 if (VT->getElementCount() == ElementCount::getScalable(1))
3842
3843 return LT.first;
3844}
3845
3846// This function returns gather/scatter overhead either from
3847// user-provided value or specialized values per-target from \p ST.
3848static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
3849 const AArch64Subtarget *ST) {
3850 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3851 "Should be called on only load or stores.");
3852 switch (Opcode) {
3853 case Instruction::Load:
3854 if (SVEGatherOverhead.getNumOccurrences() > 0)
3855 return SVEGatherOverhead;
3856 return ST->getGatherOverhead();
3857 break;
3858 case Instruction::Store:
3859 if (SVEScatterOverhead.getNumOccurrences() > 0)
3860 return SVEScatterOverhead;
3861 return ST->getScatterOverhead();
3862 break;
3863 default:
3864 llvm_unreachable("Shouldn't have reached here");
3865 }
3866}
3867
3869 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3870 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3871 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3872 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3873 Alignment, CostKind, I);
3874 auto *VT = cast<VectorType>(DataTy);
3875 auto LT = getTypeLegalizationCost(DataTy);
3876 if (!LT.first.isValid())
3878
3879 // Return an invalid cost for element types that we are unable to lower.
3880 if (!LT.second.isVector() ||
3881 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
3882 VT->getElementType()->isIntegerTy(1))
3884
3885 // The code-generator is currently not able to handle scalable vectors
3886 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3887 // it. This change will be removed when code-generation for these types is
3888 // sufficiently reliable.
3889 if (VT->getElementCount() == ElementCount::getScalable(1))
3891
3892 ElementCount LegalVF = LT.second.getVectorElementCount();
3893 InstructionCost MemOpCost =
3894 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3895 {TTI::OK_AnyValue, TTI::OP_None}, I);
3896 // Add on an overhead cost for using gathers/scatters.
3897 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
3898 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3899}
3900
3902 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3903}
3904
3906 MaybeAlign Alignment,
3907 unsigned AddressSpace,
3909 TTI::OperandValueInfo OpInfo,
3910 const Instruction *I) {
3911 EVT VT = TLI->getValueType(DL, Ty, true);
3912 // Type legalization can't handle structs
3913 if (VT == MVT::Other)
3914 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3915 CostKind);
3916
3917 auto LT = getTypeLegalizationCost(Ty);
3918 if (!LT.first.isValid())
3920
3921 // The code-generator is currently not able to handle scalable vectors
3922 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3923 // it. This change will be removed when code-generation for these types is
3924 // sufficiently reliable.
3925 // We also only support full register predicate loads and stores.
3926 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3927 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3928 (VTy->getElementType()->isIntegerTy(1) &&
3929 !VTy->getElementCount().isKnownMultipleOf(
3932
3933 // TODO: consider latency as well for TCK_SizeAndLatency.
3935 return LT.first;
3936
3938 return 1;
3939
3940 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3941 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3942 // Unaligned stores are extremely inefficient. We don't split all
3943 // unaligned 128-bit stores because the negative impact that has shown in
3944 // practice on inlined block copy code.
3945 // We make such stores expensive so that we will only vectorize if there
3946 // are 6 other instructions getting vectorized.
3947 const int AmortizationCost = 6;
3948
3949 return LT.first * 2 * AmortizationCost;
3950 }
3951
3952 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3953 if (Ty->isPtrOrPtrVectorTy())
3954 return LT.first;
3955
3956 if (useNeonVector(Ty)) {
3957 // Check truncating stores and extending loads.
3958 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3959 // v4i8 types are lowered to scalar a load/store and sshll/xtn.
3960 if (VT == MVT::v4i8)
3961 return 2;
3962 // Otherwise we need to scalarize.
3963 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3964 }
3965 EVT EltVT = VT.getVectorElementType();
3966 unsigned EltSize = EltVT.getScalarSizeInBits();
3967 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3968 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3969 *Alignment != Align(1))
3970 return LT.first;
3971 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3972 // widening to v4i8, which produces suboptimal results.
3973 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3974 return LT.first;
3975
3976 // Check non-power-of-2 loads/stores for legal vector element types with
3977 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3978 // operations on smaller power-of-2 ops, including ld1/st1.
3979 LLVMContext &C = Ty->getContext();
3981 SmallVector<EVT> TypeWorklist;
3982 TypeWorklist.push_back(VT);
3983 while (!TypeWorklist.empty()) {
3984 EVT CurrVT = TypeWorklist.pop_back_val();
3985 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3986 if (isPowerOf2_32(CurrNumElements)) {
3987 Cost += 1;
3988 continue;
3989 }
3990
3991 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3992 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3993 TypeWorklist.push_back(
3994 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3995 }
3996 return Cost;
3997 }
3998
3999 return LT.first;
4000}
4001
4003 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4004 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4005 bool UseMaskForCond, bool UseMaskForGaps) {
4006 assert(Factor >= 2 && "Invalid interleave factor");
4007 auto *VecVTy = cast<VectorType>(VecTy);
4008
4009 if (VecTy->isScalableTy() && !ST->hasSVE())
4011
4012 // Vectorization for masked interleaved accesses is only enabled for scalable
4013 // VF.
4014 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4016
4017 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4018 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4019 auto *SubVecTy =
4020 VectorType::get(VecVTy->getElementType(),
4021 VecVTy->getElementCount().divideCoefficientBy(Factor));
4022
4023 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4024 // Accesses having vector types that are a multiple of 128 bits can be
4025 // matched to more than one ldN/stN instruction.
4026 bool UseScalable;
4027 if (MinElts % Factor == 0 &&
4028 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4029 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4030 }
4031
4032 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4033 Alignment, AddressSpace, CostKind,
4034 UseMaskForCond, UseMaskForGaps);
4035}
4036
4041 for (auto *I : Tys) {
4042 if (!I->isVectorTy())
4043 continue;
4044 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4045 128)
4046 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4047 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4048 }
4049 return Cost;
4050}
4051
4053 return ST->getMaxInterleaveFactor();
4054}
4055
4056// For Falkor, we want to avoid having too many strided loads in a loop since
4057// that can exhaust the HW prefetcher resources. We adjust the unroller
4058// MaxCount preference below to attempt to ensure unrolling doesn't create too
4059// many strided loads.
4060static void
4063 enum { MaxStridedLoads = 7 };
4064 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4065 int StridedLoads = 0;
4066 // FIXME? We could make this more precise by looking at the CFG and
4067 // e.g. not counting loads in each side of an if-then-else diamond.
4068 for (const auto BB : L->blocks()) {
4069 for (auto &I : *BB) {
4070 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4071 if (!LMemI)
4072 continue;
4073
4074 Value *PtrValue = LMemI->getPointerOperand();
4075 if (L->isLoopInvariant(PtrValue))
4076 continue;
4077
4078 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4079 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4080 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4081 continue;
4082
4083 // FIXME? We could take pairing of unrolled load copies into account
4084 // by looking at the AddRec, but we would probably have to limit this
4085 // to loops with no stores or other memory optimization barriers.
4086 ++StridedLoads;
4087 // We've seen enough strided loads that seeing more won't make a
4088 // difference.
4089 if (StridedLoads > MaxStridedLoads / 2)
4090 return StridedLoads;
4091 }
4092 }
4093 return StridedLoads;
4094 };
4095
4096 int StridedLoads = countStridedLoads(L, SE);
4097 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4098 << " strided loads\n");
4099 // Pick the largest power of 2 unroll count that won't result in too many
4100 // strided loads.
4101 if (StridedLoads) {
4102 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
4103 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4104 << UP.MaxCount << '\n');
4105 }
4106}
4107
4108/// For Apple CPUs, we want to runtime-unroll loops to make better use if the
4109/// OOO engine's wide instruction window and various predictors.
4110static void
// NOTE(review): the parameter list (original lines 4111-4113) is absent from
// this extraction; the call site below invokes
// getAppleRuntimeUnrollPreferences(L, SE, UP, *this) -- confirm the exact
// signature against the upstream file.
4114 // Limit loops with structure that is highly likely to benefit from runtime
4115 // unrolling; that is we exclude outer loops, loops with multiple exits and
4116 // many blocks (i.e. likely with complex control flow). Note that the
4117 // heuristics here may be overly conservative and we err on the side of
4118 // avoiding runtime unrolling rather than unroll excessively. They are all
4119 // subject to further refinement.
4120 if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
4121 return;
4122
// Skip loops whose backedge-taken count is a known constant (static
// unrolling handles those), cannot be computed, or whose known max trip
// count is at most 32 -- runtime unrolling is unlikely to pay off there.
4123 const SCEV *BTC = SE.getBackedgeTakenCount(L);
4124 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4125 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4126 SE.getSmallConstantMaxTripCount(L) <= 32))
4127 return;
// Leave already-vectorized loops alone.
4128 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4129 return;
4130
// Estimate the loop body size; give up on loops that contain real calls
// (intrinsic calls are allowed).
4131 int64_t Size = 0;
4132 for (auto *BB : L->getBlocks()) {
4133 for (auto &I : *BB) {
4134 if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4135 return;
4136 SmallVector<const Value *, 4> Operands(I.operand_values());
4137 Size +=
// NOTE(review): the accumulated cost expression (original line 4138) is
// absent from this extraction.
4139 }
4140 }
4141
4142 // Limit to loops with trip counts that are cheap to expand.
4143 UP.SCEVExpansionBudget = 1;
4144
4145 // Try to unroll small, single block loops, if they have load/store
4146 // dependencies, to expose more parallel memory access streams.
4147 BasicBlock *Header = L->getHeader();
4148 if (Header == L->getLoopLatch()) {
4149 if (Size > 8)
4150 return;
4151
// Collect loop-varying loaded values and the stores so the load->store
// dependency check below can use them.
4152 SmallPtrSet<Value *, 8> LoadedValues;
// NOTE(review): the declaration of "Stores" (original line 4153) is absent
// from this extraction -- presumably a vector of StoreInst*.
4154 for (auto *BB : L->blocks()) {
4155 for (auto &I : *BB) {
// NOTE(review): the initialization of "Ptr" (original line 4156, presumably
// the load/store pointer operand of I) is absent from this extraction.
4157 if (!Ptr)
4158 continue;
4159 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4160 if (SE.isLoopInvariant(PtrSCEV, L))
4161 continue;
4162 if (isa<LoadInst>(&I))
4163 LoadedValues.insert(&I);
4164 else
4165 Stores.push_back(cast<StoreInst>(&I));
4166 }
4167 }
4168
4169 // Try to find an unroll count that maximizes the use of the instruction
4170 // window, i.e. trying to fetch as many instructions per cycle as possible.
4171 unsigned MaxInstsPerLine = 16;
4172 unsigned UC = 1;
4173 unsigned BestUC = 1;
4174 unsigned SizeWithBestUC = BestUC * Size;
// Consider unroll counts up to 8, capping the unrolled size at 48; keep the
// count whose unrolled size best fills a multiple of MaxInstsPerLine.
4175 while (UC <= 8) {
4176 unsigned SizeWithUC = UC * Size;
4177 if (SizeWithUC > 48)
4178 break;
4179 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4180 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4181 BestUC = UC;
4182 SizeWithBestUC = BestUC * Size;
4183 }
4184 UC++;
4185 }
4186
// Only unroll if some store in the loop stores a value that was loaded in
// the loop.
4187 if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4188 return LoadedValues.contains(SI->getOperand(0));
4189 }))
4190 return;
4191
4192 UP.Runtime = true;
4193 UP.DefaultUnrollRuntimeCount = BestUC;
4194 return;
4195 }
4196
4197 // Try to runtime-unroll loops with early-continues depending on loop-varying
4198 // loads; this helps with branch-prediction for the early-continues.
4199 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
4200 auto *Latch = L->getLoopLatch();
// NOTE(review): the definition of "Preds" (original line 4201, presumably
// the predecessors of the latch) is absent from this extraction.
4202 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
4203 none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4204 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
4205 return;
4206
// Depth-limited recursive walk of the operand graph: does I transitively
// depend on a load? PHIs and loop-invariant values terminate the search
// without a match.
4207 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
4208 [&](Instruction *I, unsigned Depth) -> bool {
4209 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
4210 return false;
4211
4212 if (isa<LoadInst>(I))
4213 return true;
4214
4215 return any_of(I->operands(), [&](Value *V) {
4216 auto *I = dyn_cast<Instruction>(V);
4217 return I && DependsOnLoopLoad(I, Depth + 1);
4218 });
4219 };
4220 CmpPredicate Pred;
4221 Instruction *I;
// Enable runtime unrolling when the header's conditional branch compares a
// value that depends on a load inside the loop.
4222 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
4223 m_Value())) &&
4224 DependsOnLoopLoad(I, 0)) {
4225 UP.Runtime = true;
4226 }
4227}
4228
4232 // Enable partial unrolling and runtime unrolling.
// NOTE(review): this is the body of the unrolling-preferences hook (the
// signature lines, original 4229-4231, are absent from this extraction);
// it starts from the base-class defaults and then layers AArch64 policy on
// top.
4233 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
4234
4235 UP.UpperBound = true;
4236
4237 // For inner loop, it is more likely to be a hot one, and the runtime check
4238 // can be promoted out from LICM pass, so the overhead is less, let's try
4239 // a larger threshold to unroll more loops.
4240 if (L->getLoopDepth() > 1)
4241 UP.PartialThreshold *= 2;
4242
4243 // Disable partial & runtime unrolling on -Os.
// NOTE(review): the statement implementing the -Os disable (original line
// 4244) is absent from this extraction.
4245
4246 // Apply subtarget-specific unrolling preferences.
4247 switch (ST->getProcFamily()) {
4248 case AArch64Subtarget::AppleA14:
4249 case AArch64Subtarget::AppleA15:
4250 case AArch64Subtarget::AppleA16:
4251 case AArch64Subtarget::AppleM4:
4252 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
4253 break;
4254 case AArch64Subtarget::Falkor:
// NOTE(review): the Falkor-specific handling (original lines 4255-4256,
// presumably guarded by EnableFalkorHWPFUnrollFix) is absent from this
// extraction.
4257 break;
4258 default:
4259 break;
4260 }
4261
4262 // Scan the loop: don't unroll loops with calls as this could prevent
4263 // inlining. Don't unroll vector loops either, as they don't benefit much from
4264 // unrolling.
4265 for (auto *BB : L->getBlocks()) {
4266 for (auto &I : *BB) {
4267 // Don't unroll vectorised loop.
4268 if (I.getType()->isVectorTy())
4269 return;
4270
4271 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
// Calls that are not lowered to real machine calls do not block unrolling.
4272 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
4273 if (!isLoweredToCall(F))
4274 continue;
4275 }
4276 return;
4277 }
4278 }
4279 }
4280
4281 // Enable runtime unrolling for in-order models
4282 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
4283 // checking for that case, we can ensure that the default behaviour is
4284 // unchanged
// NOTE(review): the first half of this condition (original line 4285,
// presumably the getProcFamily() != Others check described above) is absent
// from this extraction.
4286 !ST->getSchedModel().isOutOfOrder()) {
4287 UP.Runtime = true;
4288 UP.Partial = true;
4289 UP.UnrollRemainder = true;
// NOTE(review): original line 4290 (presumably setting
// DefaultUnrollRuntimeCount) is absent from this extraction.
4291
4292 UP.UnrollAndJam = true;
// NOTE(review): original line 4293 (presumably setting the unroll-and-jam
// inner-loop threshold) is absent from this extraction.
4294 }
4295}
4296
4300}
4301
// NOTE(review): this is the getOrCreateResultFromMemIntrinsic hook (the
// opening signature line, original 4302, is absent from this extraction).
// It tries to produce a value of ExpectedType from a NEON structured
// load/store intrinsic, returning nullptr when that is not possible.
4303 Type *ExpectedType) {
4304 switch (Inst->getIntrinsicID()) {
4305 default:
4306 return nullptr;
4307 case Intrinsic::aarch64_neon_st2:
4308 case Intrinsic::aarch64_neon_st3:
4309 case Intrinsic::aarch64_neon_st4: {
4310 // Create a struct type
4311 StructType *ST = dyn_cast<StructType>(ExpectedType);
4312 if (!ST)
4313 return nullptr;
// The last argument is the pointer operand; the preceding ones are the
// stored vector values, which must match the struct's element types.
4314 unsigned NumElts = Inst->arg_size() - 1;
4315 if (ST->getNumElements() != NumElts)
4316 return nullptr;
4317 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4318 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
4319 return nullptr;
4320 }
// Rebuild the stored aggregate by inserting each stored value into a poison
// value of the expected struct type, right before the intrinsic.
4321 Value *Res = PoisonValue::get(ExpectedType);
4322 IRBuilder<> Builder(Inst);
4323 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4324 Value *L = Inst->getArgOperand(i);
4325 Res = Builder.CreateInsertValue(Res, L, i);
4326 }
4327 return Res;
4328 }
4329 case Intrinsic::aarch64_neon_ld2:
4330 case Intrinsic::aarch64_neon_ld3:
4331 case Intrinsic::aarch64_neon_ld4:
// A structured load already produces the aggregate; reuse the instruction
// itself when its type matches exactly.
4332 if (Inst->getType() == ExpectedType)
4333 return Inst;
4334 return nullptr;
4335 }
4336}
4337
// NOTE(review): this is the getTgtMemIntrinsic hook (the opening signature
// line, original 4338, is absent from this extraction). It fills in
// MemIntrinsicInfo for NEON structured load/store intrinsics and returns
// true when the intrinsic was recognised.
4339 MemIntrinsicInfo &Info) {
4340 switch (Inst->getIntrinsicID()) {
4341 default:
4342 break;
4343 case Intrinsic::aarch64_neon_ld2:
4344 case Intrinsic::aarch64_neon_ld3:
4345 case Intrinsic::aarch64_neon_ld4:
// Structured loads read memory through their first argument.
4346 Info.ReadMem = true;
4347 Info.WriteMem = false;
4348 Info.PtrVal = Inst->getArgOperand(0);
4349 break;
4350 case Intrinsic::aarch64_neon_st2:
4351 case Intrinsic::aarch64_neon_st3:
4352 case Intrinsic::aarch64_neon_st4:
// Structured stores write memory through their last argument.
4353 Info.ReadMem = false;
4354 Info.WriteMem = true;
4355 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
4356 break;
4357 }
4358
// Give loads and stores of the same arity a common MatchingId so clients
// can pair an ldN with the corresponding stN.
4359 switch (Inst->getIntrinsicID()) {
4360 default:
4361 return false;
4362 case Intrinsic::aarch64_neon_ld2:
4363 case Intrinsic::aarch64_neon_st2:
4364 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
4365 break;
4366 case Intrinsic::aarch64_neon_ld3:
4367 case Intrinsic::aarch64_neon_st3:
4368 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
4369 break;
4370 case Intrinsic::aarch64_neon_ld4:
4371 case Intrinsic::aarch64_neon_st4:
4372 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
4373 break;
4374 }
4375 return true;
4376}
4377
4378/// See if \p I should be considered for address type promotion. We check if \p
4379/// I is a sext with right type and used in memory accesses. If it used in a
4380/// "complex" getelementptr, we allow it to be promoted without finding other
4381/// sext instructions that sign extended the same initial value. A getelementptr
4382/// is considered as "complex" if it has more than 2 operands.
// NOTE(review): the opening signature line (original 4383) is absent from
// this extraction.
4384 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
4385 bool Considerable = false;
4386 AllowPromotionWithoutCommonHeader = false;
// Only sign-extensions whose result type is i64 are of interest here.
4387 if (!isa<SExtInst>(&I))
4388 return false;
4389 Type *ConsideredSExtType =
4390 Type::getInt64Ty(I.getParent()->getParent()->getContext());
4391 if (I.getType() != ConsideredSExtType)
4392 return false;
4393 // See if the sext is the one with the right type and used in at least one
4394 // GetElementPtrInst.
4395 for (const User *U : I.users()) {
4396 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
4397 Considerable = true;
4398 // A getelementptr is considered as "complex" if it has more than 2
4399 // operands. We will promote a SExt used in such complex GEP as we
4400 // expect some computation to be merged if they are done on 64 bits.
4401 if (GEPInst->getNumOperands() > 2) {
4402 AllowPromotionWithoutCommonHeader = true;
4403 break;
4404 }
4405 }
4406 }
4407 return Considerable;
4408}
4409
// NOTE(review): this is a reduction-legality query (the opening signature
// line, original 4410, is absent from this extraction). Fixed-width VFs are
// always accepted; scalable VFs are restricted to the recurrence kinds
// listed below.
4411 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
4412 if (!VF.isScalable())
4413 return true;
4414
4415 Type *Ty = RdxDesc.getRecurrenceType();
// NOTE(review): the element-type legality check guarding this early return
// (original line 4416) is absent from this extraction.
4417 return false;
4418
// Only these recurrence kinds are supported with scalable vectors.
4419 switch (RdxDesc.getRecurrenceKind()) {
4420 case RecurKind::Add:
4421 case RecurKind::FAdd:
4422 case RecurKind::And:
4423 case RecurKind::Or:
4424 case RecurKind::Xor:
4425 case RecurKind::SMin:
4426 case RecurKind::SMax:
4427 case RecurKind::UMin:
4428 case RecurKind::UMax:
4429 case RecurKind::FMin:
4430 case RecurKind::FMax:
4431 case RecurKind::FMulAdd:
4432 case RecurKind::IAnyOf:
4433 case RecurKind::FAnyOf:
4434 return true;
4435 default:
4436 return false;
4437 }
4438}
4439
// NOTE(review): this is the min/max reduction cost hook (the opening
// signature lines, original 4440-4441 and 4443, are absent from this
// extraction).
4442 FastMathFlags FMF,
4444 // The code-generator is currently not able to handle scalable vectors
4445 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4446 // it. This change will be removed when code-generation for these types is
4447 // sufficiently reliable.
4448 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4449 if (VTy->getElementCount() == ElementCount::getScalable(1))
// NOTE(review): the invalid-cost return (original line 4450) is absent from
// this extraction.
4451
4452 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4453
// Without full fp16 support, f16 min/max reductions fall back to the base
// implementation's cost model.
4454 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
4455 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
4456
// If legalization splits the type into LT.first parts, each extra part
// costs one pairwise min/max intrinsic to combine before the final
// horizontal reduction.
4457 InstructionCost LegalizationCost = 0;
4458 if (LT.first > 1) {
4459 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
4460 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
4461 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
4462 }
4463
4464 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
4465}
4466
// NOTE(review): this is getArithmeticReductionCostSVE (the opening line,
// original 4467, is absent from this extraction). It models an SVE
// arithmetic reduction as (LT.first - 1) combining ops plus a fixed
// horizontal-reduction cost.
4468 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
4469 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
// Each extra legalized part needs one binary op to fold it into the running
// value before the final horizontal reduction.
4470 InstructionCost LegalizationCost = 0;
4471 if (LT.first > 1) {
4472 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
4473 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
4474 LegalizationCost *= LT.first - 1;
4475 }
4476
4477 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4478 assert(ISD && "Invalid opcode");
4479 // Add the final reduction cost for the legal horizontal reduction
4480 switch (ISD) {
4481 case ISD::ADD:
4482 case ISD::AND:
4483 case ISD::OR:
4484 case ISD::XOR:
4485 case ISD::FADD:
4486 return LegalizationCost + 2;
4487 default:
// NOTE(review): the default return (original line 4488, presumably an
// invalid cost) is absent from this extraction.
4489 }
4490}
4491
// NOTE(review): this is the arithmetic-reduction cost hook. The opening
// signature lines and several interior lines are absent from this
// extraction (visible as gaps in the embedded numbering); the code is
// reproduced as-is and the comments describe only what is visible.
4494 std::optional<FastMathFlags> FMF,
4496 // The code-generator is currently not able to handle scalable vectors
4497 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4498 // it. This change will be removed when code-generation for these types is
4499 // sufficiently reliable.
4500 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
4501 if (VTy->getElementCount() == ElementCount::getScalable(1))
4503
// Ordered-reduction handling: fixed-width types take the base cost plus a
// per-element overhead; the scalable FAdd path below charges one scalar op
// per (maximum) element. (The guarding condition, original line 4504, is
// absent from this extraction.)
4505 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
4506 InstructionCost BaseCost =
4507 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4508 // Add on extra cost to reflect the extra overhead on some CPUs. We still
4509 // end up vectorizing for more computationally intensive loops.
4510 return BaseCost + FixedVTy->getNumElements();
4511 }
4512
4513 if (Opcode != Instruction::FAdd)
4515
4516 auto *VTy = cast<ScalableVectorType>(ValTy);
4518 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
4519 Cost *= getMaxNumElements(VTy->getElementCount());
4520 return Cost;
4521 }
4522
4523 if (isa<ScalableVectorType>(ValTy))
4524 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
4525
4526 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4527 MVT MTy = LT.second;
4528 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4529 assert(ISD && "Invalid opcode");
4530
4531 // Horizontal adds can use the 'addv' instruction. We model the cost of these
4532 // instructions as twice a normal vector add, plus 1 for each legalization
4533 // step (LT.first). This is the only arithmetic vector reduction operation for
4534 // which we have an instruction.
4535 // OR, XOR and AND costs should match the codegen from:
4536 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
4537 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
4538 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
4539 static const CostTblEntry CostTblNoPairwise[]{
4540 {ISD::ADD, MVT::v8i8, 2},
4541 {ISD::ADD, MVT::v16i8, 2},
4542 {ISD::ADD, MVT::v4i16, 2},
4543 {ISD::ADD, MVT::v8i16, 2},
4544 {ISD::ADD, MVT::v4i32, 2},
4545 {ISD::ADD, MVT::v2i64, 2},
4546 {ISD::OR, MVT::v8i8, 15},
4547 {ISD::OR, MVT::v16i8, 17},
4548 {ISD::OR, MVT::v4i16, 7},
4549 {ISD::OR, MVT::v8i16, 9},
4550 {ISD::OR, MVT::v2i32, 3},
4551 {ISD::OR, MVT::v4i32, 5},
4552 {ISD::OR, MVT::v2i64, 3},
4553 {ISD::XOR, MVT::v8i8, 15},
4554 {ISD::XOR, MVT::v16i8, 17},
4555 {ISD::XOR, MVT::v4i16, 7},
4556 {ISD::XOR, MVT::v8i16, 9},
4557 {ISD::XOR, MVT::v2i32, 3},
4558 {ISD::XOR, MVT::v4i32, 5},
4559 {ISD::XOR, MVT::v2i64, 3},
4560 {ISD::AND, MVT::v8i8, 15},
4561 {ISD::AND, MVT::v16i8, 17},
4562 {ISD::AND, MVT::v4i16, 7},
4563 {ISD::AND, MVT::v8i16, 9},
4564 {ISD::AND, MVT::v2i32, 3},
4565 {ISD::AND, MVT::v4i32, 5},
4566 {ISD::AND, MVT::v2i64, 3},
4567 };
4568 switch (ISD) {
4569 default:
4570 break;
4571 case ISD::FADD:
4572 if (Type *EltTy = ValTy->getScalarType();
4573 // FIXME: For half types without fullfp16 support, this could extend and
4574 // use a fp32 faddp reduction but current codegen unrolls.
4575 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
4576 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
4577 const unsigned NElts = MTy.getVectorNumElements();
4578 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
4579 isPowerOf2_32(NElts))
4580 // Reduction corresponding to series of fadd instructions is lowered to
4581 // series of faddp instructions. faddp has latency/throughput that
4582 // matches fadd instruction and hence, every faddp instruction can be
4583 // considered to have a relative cost = 1 with
4584 // CostKind = TCK_RecipThroughput.
4585 // An faddp will pairwise add vector elements, so the size of input
4586 // vector reduces by half every time, requiring
4587 // #(faddp instructions) = log2_32(NElts).
4588 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
4589 }
4590 break;
4591 case ISD::ADD:
4592 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
4593 return (LT.first - 1) + Entry->Cost;
4594 break;
4595 case ISD::XOR:
4596 case ISD::AND:
4597 case ISD::OR:
4598 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
4599 if (!Entry)
4600 break;
4601 auto *ValVTy = cast<FixedVectorType>(ValTy);
4602 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
4603 isPowerOf2_32(ValVTy->getNumElements())) {
4604 InstructionCost ExtraCost = 0;
4605 if (LT.first != 1) {
4606 // Type needs to be split, so there is an extra cost of LT.first - 1
4607 // arithmetic ops.
4608 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
4609 MTy.getVectorNumElements());
4610 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
4611 ExtraCost *= LT.first - 1;
4612 }
4613 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
4614 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
4615 return Cost + ExtraCost;
4616 }
4617 break;
4618 }
4619 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4620}
4621
// NOTE(review): this is the splice-cost helper used by getShuffleCost for
// scalable vectors (its opening signature line, original 4622, is absent
// from this extraction, as are a few interior lines -- gaps in the embedded
// numbering).
4623 static const CostTblEntry ShuffleTbl[] = {
4624 { TTI::SK_Splice, MVT::nxv16i8, 1 },
4625 { TTI::SK_Splice, MVT::nxv8i16, 1 },
4626 { TTI::SK_Splice, MVT::nxv4i32, 1 },
4627 { TTI::SK_Splice, MVT::nxv2i64, 1 },
4628 { TTI::SK_Splice, MVT::nxv2f16, 1 },
4629 { TTI::SK_Splice, MVT::nxv4f16, 1 },
4630 { TTI::SK_Splice, MVT::nxv8f16, 1 },
4631 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
4632 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
4633 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
4634 { TTI::SK_Splice, MVT::nxv2f32, 1 },
4635 { TTI::SK_Splice, MVT::nxv4f32, 1 },
4636 { TTI::SK_Splice, MVT::nxv2f64, 1 },
4637 };
4638
4639 // The code-generator is currently not able to handle scalable vectors
4640 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4641 // it. This change will be removed when code-generation for these types is
4642 // sufficiently reliable.
// NOTE(review): the <vscale x 1> check and invalid-cost return (original
// lines 4643-4644) are absent from this extraction.
4645
4646 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4647 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
// i1 predicate vectors are promoted before splicing; cost on the promoted
// type. (Original line 4648 is absent from this extraction.)
4649 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
4650 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4651 : LT.second;
4652 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4653 InstructionCost LegalizationCost = 0;
// A negative Index additionally requires a compare+select to build the
// splice predicate. (The cost-kind arguments on the lines below, original
// 4657 and 4659, are absent from this extraction.)
4654 if (Index < 0) {
4655 LegalizationCost =
4656 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
4658 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
4660 }
4661
4662 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
4663 // Cost performed on a promoted type.
4664 if (LT.second.getScalarType() == MVT::i1) {
4665 LegalizationCost +=
4666 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
4668 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
4670 }
4671 const auto *Entry =
4672 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
4673 assert(Entry && "Illegal Type for Splice");
4674 LegalizationCost += Entry->Cost;
4675 return LegalizationCost * LT.first;
4676}
4677
// NOTE(review): this is the partial-reduction (dot-product style) cost
// query. The opening signature lines and a few interior lines (declarations
// of "Invalid" and "Cost", and parts of two conditions) are absent from
// this extraction -- visible as gaps in the embedded numbering.
4679 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
4682 std::optional<unsigned> BinOp) const {
4685
// Only add-of-mul (i.e. dot-product shaped) partial reductions with equal
// input types are supported.
4686 if (Opcode != Instruction::Add)
4687 return Invalid;
4688
4689 if (InputTypeA != InputTypeB)
4690 return Invalid;
4691
4692 EVT InputEVT = EVT::getEVT(InputTypeA);
4693 EVT AccumEVT = EVT::getEVT(AccumType);
4694
4695 unsigned VFMinValue = VF.getKnownMinValue();
4696
4697 if (VF.isScalable()) {
// NOTE(review): the SVE/streaming availability check guarding this return
// (original line 4698) is absent from this extraction.
4699 return Invalid;
4700
4701 // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
4702 // since we can't lower that type.
4703 unsigned Scale =
4704 AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
4705 if (VFMinValue == Scale)
4706 return Invalid;
4707 }
// Fixed-width partial reductions need NEON with dot-product support and a
// non-i64 accumulator.
4708 if (VF.isFixed() &&
4709 (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64))
4710 return Invalid;
4711
// i8 inputs: only VF 8 or 16, with accumulator-width-dependent cost
// doubling; i16 inputs: only VF 8 with an i64 accumulator.
4712 if (InputEVT == MVT::i8) {
4713 switch (VFMinValue) {
4714 default:
4715 return Invalid;
4716 case 8:
4717 if (AccumEVT == MVT::i32)
4718 Cost *= 2;
4719 else if (AccumEVT != MVT::i64)
4720 return Invalid;
4721 break;
4722 case 16:
4723 if (AccumEVT == MVT::i64)
4724 Cost *= 2;
4725 else if (AccumEVT != MVT::i32)
4726 return Invalid;
4727 break;
4728 }
4729 } else if (InputEVT == MVT::i16) {
4730 // FIXME: Allow i32 accumulator but increase cost, as we would extend
4731 // it to i64.
4732 if (VFMinValue != 8 || AccumEVT != MVT::i64)
4733 return Invalid;
4734 } else
4735 return Invalid;
4736
4737 // AArch64 supports lowering mixed extensions to a usdot but only if the
4738 // i8mm or sve/streaming features are available.
// NOTE(review): the final clause of this condition (original line 4741) is
// absent from this extraction.
4739 if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None ||
4740 (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
4742 return Invalid;
4743
4744 if (!BinOp || *BinOp != Instruction::Mul)
4745 return Invalid;
4746
4747 return Cost;
4748}
4749
// NOTE(review): this is the shuffle-cost hook. The opening signature lines
// and several interior lines are absent from this extraction (visible as
// gaps in the embedded numbering); the code is reproduced as-is and the
// comments below describe only what is visible.
4752 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
4753 ArrayRef<const Value *> Args, const Instruction *CxtI) {
4754 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4755
4756 // If we have a Mask, and the LT is being legalized somehow, split the Mask
4757 // into smaller vectors and sum the cost of each shuffle.
4758 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
4759 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
4760 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
4761
4762 // Check for LD3/LD4 instructions, which are represented in llvm IR as
4763 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
4764 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
4765 // cost than just the load.
// NOTE(review): the deinterleave-mask checks (original lines 4767-4768) are
// absent from this extraction.
4766 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
4769 return std::max<InstructionCost>(1, LT.first / 4);
4770
4771 // Check for ST3/ST4 instructions, which are represented in llvm IR as
4772 // store(interleaving-shuffle). The shuffle cost could potentially be free,
4773 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
4774 // cost than just the store.
// NOTE(review): parts of the interleave-mask checks (original lines 4776
// and 4778) are absent from this extraction.
4775 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
4777 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
4779 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
4780 return LT.first;
4781
// Split the wide mask into NumVecs chunks of the legalized width and cost
// each chunk individually. (The declaration of "Cost", original line 4787,
// is absent from this extraction.)
4782 unsigned TpNumElts = Mask.size();
4783 unsigned LTNumElts = LT.second.getVectorNumElements();
4784 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
4785 VectorType *NTp =
4786 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
4788 for (unsigned N = 0; N < NumVecs; N++) {
4789 SmallVector<int> NMask;
4790 // Split the existing mask into chunks of size LTNumElts. Track the source
4791 // sub-vectors to ensure the result has at most 2 inputs.
4792 unsigned Source1, Source2;
4793 unsigned NumSources = 0;
// NOTE(review): the out-of-range/poison handling lines (original 4796 and
// 4798) are absent from this extraction.
4794 for (unsigned E = 0; E < LTNumElts; E++) {
4795 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
4797 if (MaskElt < 0) {
4799 continue;
4800 }
4801
4802 // Calculate which source from the input this comes from and whether it
4803 // is new to us.
4804 unsigned Source = MaskElt / LTNumElts;
4805 if (NumSources == 0) {
4806 Source1 = Source;
4807 NumSources = 1;
4808 } else if (NumSources == 1 && Source != Source1) {
4809 Source2 = Source;
4810 NumSources = 2;
4811 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
4812 NumSources++;
4813 }
4814
4815 // Add to the new mask. For the NumSources>2 case these are not correct,
4816 // but are only used for the modular lane number.
4817 if (Source == Source1)
4818 NMask.push_back(MaskElt % LTNumElts);
4819 else if (Source == Source2)
4820 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
4821 else
4822 NMask.push_back(MaskElt % LTNumElts);
4823 }
4824 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
4825 // getShuffleCost. If not then cost it using the worst case as the number
4826 // of element moves into a new vector.
// NOTE(review): the second shuffle-kind operand of this call (original line
// 4829, presumably TTI::SK_PermuteTwoSrc) is absent from this extraction.
4827 if (NumSources <= 2)
4828 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
4830 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
4831 else
4832 Cost += LTNumElts;
4833 }
4834 return Cost;
4835 }
4836
4837 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
4838 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
4839 // A subvector extract can be implemented with an ext (or trivial extract, if
4840 // from lane 0). This currently only handles low or high extracts to prevent
4841 // SLP vectorizer regressions.
// NOTE(review): the kind adjustment inside this block (original line 4851)
// is absent from this extraction.
4842 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
4843 if (LT.second.is128BitVector() &&
4844 cast<FixedVectorType>(SubTp)->getNumElements() ==
4845 LT.second.getVectorNumElements() / 2) {
4846 if (Index == 0)
4847 return 0;
4848 if (Index == (int)LT.second.getVectorNumElements() / 2)
4849 return 1;
4850 }
4852 }
4853
4854 // Check for broadcast loads, which are supported by the LD1R instruction.
4855 // In terms of code-size, the shuffle vector is free when a load + dup get
4856 // folded into a LD1R. That's what we check and return here. For performance
4857 // and reciprocal throughput, a LD1R is not completely free. In this case, we
4858 // return the cost for the broadcast below (i.e. 1 for most/all types), so
4859 // that we model the load + dup sequence slightly higher because LD1R is a
4860 // high latency instruction.
// NOTE(review): part of this condition (original line 4864, presumably the
// broadcast-load legality query) is absent from this extraction.
4861 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
4862 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
4863 if (IsLoad && LT.second.isVector() &&
4865 LT.second.getVectorElementCount()))
4866 return 0;
4867 }
4868
4869 // If we have 4 elements for the shuffle and a Mask, get the cost straight
4870 // from the perfect shuffle tables.
4871 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
4872 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
4873 all_of(Mask, [](int E) { return E < 8; }))
4874 return getPerfectShuffleCost(Mask);
4875
4876 // Check for identity masks, which we can treat as free.
4877 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
4878 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4879 all_of(enumerate(Mask), [](const auto &M) {
4880 return M.value() < 0 || M.value() == (int)M.index();
4881 }))
4882 return 0;
4883
4884 // Check for other shuffles that are not SK_ kinds but we have native
4885 // instructions for, for example ZIP and UZP.
4886 unsigned Unused;
4887 if (LT.second.isFixedLengthVector() &&
4888 LT.second.getVectorNumElements() == Mask.size() &&
4889 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4890 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4891 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4892 // Check for non-zero lane splats
4893 all_of(drop_begin(Mask),
4894 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
4895 return 1;
4896
4897 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
4898 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
4899 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
4900 static const CostTblEntry ShuffleTbl[] = {
4901 // Broadcast shuffle kinds can be performed with 'dup'.
4902 {TTI::SK_Broadcast, MVT::v8i8, 1},
4903 {TTI::SK_Broadcast, MVT::v16i8, 1},
4904 {TTI::SK_Broadcast, MVT::v4i16, 1},
4905 {TTI::SK_Broadcast, MVT::v8i16, 1},
4906 {TTI::SK_Broadcast, MVT::v2i32, 1},
4907 {TTI::SK_Broadcast, MVT::v4i32, 1},
4908 {TTI::SK_Broadcast, MVT::v2i64, 1},
4909 {TTI::SK_Broadcast, MVT::v4f16, 1},
4910 {TTI::SK_Broadcast, MVT::v8f16, 1},
4911 {TTI::SK_Broadcast, MVT::v2f32, 1},
4912 {TTI::SK_Broadcast, MVT::v4f32, 1},
4913 {TTI::SK_Broadcast, MVT::v2f64, 1},
4914 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
4915 // 'zip1/zip2' instructions.
4916 {TTI::SK_Transpose, MVT::v8i8, 1},
4917 {TTI::SK_Transpose, MVT::v16i8, 1},
4918 {TTI::SK_Transpose, MVT::v4i16, 1},
4919 {TTI::SK_Transpose, MVT::v8i16, 1},
4920 {TTI::SK_Transpose, MVT::v2i32, 1},
4921 {TTI::SK_Transpose, MVT::v4i32, 1},
4922 {TTI::SK_Transpose, MVT::v2i64, 1},
4923 {TTI::SK_Transpose, MVT::v4f16, 1},
4924 {TTI::SK_Transpose, MVT::v8f16, 1},
4925 {TTI::SK_Transpose, MVT::v2f32, 1},
4926 {TTI::SK_Transpose, MVT::v4f32, 1},
4927 {TTI::SK_Transpose, MVT::v2f64, 1},
4928 // Select shuffle kinds.
4929 // TODO: handle vXi8/vXi16.
4930 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4931 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
4932 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
4933 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
4934 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
4935 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
4936 // PermuteSingleSrc shuffle kinds.
4937 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
4938 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
4939 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
4940 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
4941 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
4942 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4943 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4944 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4945 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4946 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
4947 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
4948 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4949 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
4950 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
4951 // Reverse can be lowered with `rev`.
4952 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4953 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4954 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4955 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4956 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4957 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4958 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4959 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4960 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4961 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4962 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4963 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4964 // Splice can all be lowered as `ext`.
4965 {TTI::SK_Splice, MVT::v2i32, 1},
4966 {TTI::SK_Splice, MVT::v4i32, 1},
4967 {TTI::SK_Splice, MVT::v2i64, 1},
4968 {TTI::SK_Splice, MVT::v2f32, 1},
4969 {TTI::SK_Splice, MVT::v4f32, 1},
4970 {TTI::SK_Splice, MVT::v2f64, 1},
4971 {TTI::SK_Splice, MVT::v8f16, 1},
4972 {TTI::SK_Splice, MVT::v8bf16, 1},
4973 {TTI::SK_Splice, MVT::v8i16, 1},
4974 {TTI::SK_Splice, MVT::v16i8, 1},
4975 {TTI::SK_Splice, MVT::v4bf16, 1},
4976 {TTI::SK_Splice, MVT::v4f16, 1},
4977 {TTI::SK_Splice, MVT::v4i16, 1},
4978 {TTI::SK_Splice, MVT::v8i8, 1},
4979 // Broadcast shuffle kinds for scalable vectors
4980 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4981 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4982 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4983 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4984 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4985 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4986 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4987 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4988 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4989 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4990 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4991 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4992 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4993 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4994 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4995 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4996 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4997 // Handle the cases for vector.reverse with scalable vectors
4998 {TTI::SK_Reverse, MVT::nxv16i8, 1},
4999 {TTI::SK_Reverse, MVT::nxv8i16, 1},
5000 {TTI::SK_Reverse, MVT::nxv4i32, 1},
5001 {TTI::SK_Reverse, MVT::nxv2i64, 1},
5002 {TTI::SK_Reverse, MVT::nxv2f16, 1},
5003 {TTI::SK_Reverse, MVT::nxv4f16, 1},
5004 {TTI::SK_Reverse, MVT::nxv8f16, 1},
5005 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
5006 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
5007 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
5008 {TTI::SK_Reverse, MVT::nxv2f32, 1},
5009 {TTI::SK_Reverse, MVT::nxv4f32, 1},
5010 {TTI::SK_Reverse, MVT::nxv2f64, 1},
5011 {TTI::SK_Reverse, MVT::nxv16i1, 1},
5012 {TTI::SK_Reverse, MVT::nxv8i1, 1},
5013 {TTI::SK_Reverse, MVT::nxv4i1, 1},
5014 {TTI::SK_Reverse, MVT::nxv2i1, 1},
5015 };
5016 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
5017 return LT.first * Entry->Cost;
5018 }
5019
5020 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
5021 return getSpliceCost(Tp, Index);
5022
5023 // Inserting a subvector can often be done with either a D, S or H register
5024 // move, so long as the inserted vector is "aligned".
5025 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
5026 LT.second.getSizeInBits() <= 128 && SubTp) {
5027 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
5028 if (SubLT.second.isVector()) {
5029 int NumElts = LT.second.getVectorNumElements();
5030 int NumSubElts = SubLT.second.getVectorNumElements();
5031 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
5032 return SubLT.first;
5033 }
5034 }
5035
5036 // Restore optimal kind.
// NOTE(review): the kind restoration statement (original line 5038) is
// absent from this extraction.
5037 if (IsExtractSubvector)
5039 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
5040 CxtI);
5041}
5042
// NOTE(review): this is a static helper (its signature, original lines
// 5043-5044, is absent from this extraction) that reports whether any
// load/store in the loop accesses memory with a negative (decreasing)
// stride.
5045 const auto &Strides = DenseMap<Value *, const SCEV *>();
5046 for (BasicBlock *BB : TheLoop->blocks()) {
5047 // Scan the instructions in the block and look for addresses that are
5048 // consecutive and decreasing.
5049 for (Instruction &I : *BB) {
5050 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
// NOTE(review): the initialization of "Ptr" (original line 5051, presumably
// the load/store pointer operand) is absent from this extraction.
5052 Type *AccessTy = getLoadStoreType(&I);
// A computed stride below zero means a decreasing access pattern.
5053 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
5054 /*ShouldCheckWrap=*/false)
5055 .value_or(0) < 0)
5056 return true;
5057 }
5058 }
5059 }
5060 return false;
5061}
5062
// NOTE(review): body of a subtarget query whose signature (original lines
// 5063-5065) is absent from this extraction; it simply forwards the
// fixed-over-scalable preference flag from the subtarget.
5066 return ST->useFixedOverScalableIfEqualCost();
5067}
5068
// NOTE(review): body of a query whose signature (original line 5069) is
// absent from this extraction; it forwards the epilogue-vectorization
// minimum VF from the subtarget.
5070 return ST->getEpilogueVectorizationMinVF();
5071}
5072
// NOTE(review): this is the predicate-over-epilogue (tail-folding)
// preference hook; its signature (original line 5073) and several interior
// lines are absent from this extraction -- visible as gaps in the embedded
// numbering.
5074 if (!ST->hasSVE())
5075 return false;
5076
5077 // We don't currently support vectorisation with interleaving for SVE - with
5078 // such loops we're better off not using tail-folding. This gives us a chance
5079 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
5080 if (TFI->IAI->hasGroups())
5081 return false;
5082
// Accumulate the tail-folding features this loop requires.
// NOTE(review): the declaration of "Required" (original line 5083) is
// absent from this extraction.
5084 if (TFI->LVL->getReductionVars().size())
5085 Required |= TailFoldingOpts::Reductions;
5086 if (TFI->LVL->getFixedOrderRecurrences().size())
5087 Required |= TailFoldingOpts::Recurrences;
5088
5089 // We call this to discover whether any load/store pointers in the loop have
5090 // negative strides. This will require extra work to reverse the loop
5091 // predicate, which may be expensive.
// NOTE(review): the containsDecreasingPointers(...) condition (original
// lines 5092-5093) is absent from this extraction.
5094 Required |= TailFoldingOpts::Reverse;
5095 if (Required == TailFoldingOpts::Disabled)
5096 Required |= TailFoldingOpts::Simple;
5097
// NOTE(review): the first half of this satisfiability check (original line
// 5098) is absent from this extraction.
5099 Required))
5100 return false;
5101
5102 // Don't tail-fold for tight loops where we would be better off interleaving
5103 // with an unpredicated loop.
5104 unsigned NumInsns = 0;
5105 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
5106 NumInsns += BB->sizeWithoutDebug();
5107 }
5108
5109 // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
5110 return NumInsns >= SVETailFoldInsnThreshold;
5111}
5112
// Return the cost of the scaling factor used in the addressing mode
// represented by (BaseGV, BaseOffset, HasBaseReg, Scale) for memory type Ty.
// Returns 0/1 for a legal mode (1 when the scale actually costs a register
// read) and a negative value when the mode is not legal at all.
InstructionCost AArch64TTIImpl::getScalingFactorCost(
    Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg,
    int64_t Scale, unsigned AddrSpace) const {
  // Scaling factors are not free at all.
  // Operands                     | Rt Latency
  // -------------------------------------------
  // Rt, [Xn, Xm]                 | 4
  // -------------------------------------------
  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
  // Rt, [Xn, Wm, <extend> #imm]  |
  TargetLoweringBase::AddrMode AM;
  AM.BaseGV = BaseGV;
  AM.BaseOffs = BaseOffset.getFixed();
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Scale;
  AM.ScalableOffset = BaseOffset.getScalable();
  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // Scale represents reg2 * scale, thus account for 1 if
    // it is not equal to 0 or 1.
    return AM.Scale != 0 && AM.Scale != 1;
  return -1;
}
5136
// Tell CodeGenPrepare whether \p I should be optimised like a select
// (gated by -enable-aarch64-or-like-select).
bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
  if (EnableOrLikeSelectOpt) {
    // For the binary operators (e.g. or) we need to be more careful than
    // selects, here we only transform them if they are already at a natural
    // break point in the code - the end of a block with an unconditional
    // terminator.
    if (I->getOpcode() == Instruction::Or &&
        isa<BranchInst>(I->getNextNode()) &&
        cast<BranchInst>(I->getNextNode())->isUnconditional())
      return true;

    // Adds/subs are always treated as select-like here.
    if (I->getOpcode() == Instruction::Add ||
        I->getOpcode() == Instruction::Sub)
      return true;
  }
  // Otherwise fall back to the target-independent heuristic.
  return BaseT::shouldTreatInstructionLikeSelect(I);
}
5154
// Compare two LSR solution costs; return true when C1 is the better choice.
bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                   const TargetTransformInfo::LSRCost &C2) {
  // AArch64 specific here is adding the number of instructions to the
  // comparison (though not as the first consideration, as some targets do)
  // along with changing the priority of the base additions.
  // TODO: Maybe a more nuanced tradeoff between instruction count
  // and number of registers? To be investigated at a later date.
  if (EnableLSRCostOpt)
    return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
                    C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
           std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
                    C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);

  // When the AArch64 ordering is disabled, use the generic comparison.
  return BaseT::isLSRCostLess(C1, C2);
}
5170
5171static bool isSplatShuffle(Value *V) {
5172 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
5173 return all_equal(Shuf->getShuffleMask());
5174 return false;
5175}
5176
5177/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
5178/// or upper half of the vector elements.
5179static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
5180 bool AllowSplat = false) {
5181 // Scalable types can't be extract shuffle vectors.
5182 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
5183 return false;
5184
5185 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
5186 auto *FullTy = FullV->getType();
5187 auto *HalfTy = HalfV->getType();
5188 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
5189 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
5190 };
5191
5192 auto extractHalf = [](Value *FullV, Value *HalfV) {
5193 auto *FullVT = cast<FixedVectorType>(FullV->getType());
5194 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
5195 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
5196 };
5197
5198 ArrayRef<int> M1, M2;
5199 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
5200 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
5201 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
5202 return false;
5203
5204 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relavant arg so that
5205 // it is not checked as an extract below.
5206 if (AllowSplat && isSplatShuffle(Op1))
5207 S1Op1 = nullptr;
5208 if (AllowSplat && isSplatShuffle(Op2))
5209 S2Op1 = nullptr;
5210
5211 // Check that the operands are half as wide as the result and we extract
5212 // half of the elements of the input vectors.
5213 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
5214 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
5215 return false;
5216
5217 // Check the mask extracts either the lower or upper half of vector
5218 // elements.
5219 int M1Start = 0;
5220 int M2Start = 0;
5221 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
5222 if ((S1Op1 &&
5223 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
5224 (S2Op1 &&
5225 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
5226 return false;
5227
5228 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
5229 (M2Start != 0 && M2Start != (NumElements / 2)))
5230 return false;
5231 if (S1Op1 && S2Op1 && M1Start != M2Start)
5232 return false;
5233
5234 return true;
5235}
5236
5237/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
5238/// of the vector elements.
5239static bool areExtractExts(Value *Ext1, Value *Ext2) {
5240 auto areExtDoubled = [](Instruction *Ext) {
5241 return Ext->getType()->getScalarSizeInBits() ==
5242 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
5243 };
5244
5245 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
5246 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
5247 !areExtDoubled(cast<Instruction>(Ext1)) ||
5248 !areExtDoubled(cast<Instruction>(Ext2)))
5249 return false;
5250
5251 return true;
5252}
5253
5254/// Check if Op could be used with vmull_high_p64 intrinsic.
5256 Value *VectorOperand = nullptr;
5257 ConstantInt *ElementIndex = nullptr;
5258 return match(Op, m_ExtractElt(m_Value(VectorOperand),
5259 m_ConstantInt(ElementIndex))) &&
5260 ElementIndex->getValue() == 1 &&
5261 isa<FixedVectorType>(VectorOperand->getType()) &&
5262 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
5263}
5264
5265/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
5266static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
5268}
5269
5271 // Restrict ourselves to the form CodeGenPrepare typically constructs.
5272 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
5273 if (!GEP || GEP->getNumOperands() != 2)
5274 return false;
5275
5276 Value *Base = GEP->getOperand(0);
5277 Value *Offsets = GEP->getOperand(1);
5278
5279 // We only care about scalar_base+vector_offsets.
5280 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
5281 return false;
5282
5283 // Sink extends that would allow us to use 32-bit offset vectors.
5284 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
5285 auto *OffsetsInst = cast<Instruction>(Offsets);
5286 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
5287 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
5288 Ops.push_back(&GEP->getOperandUse(1));
5289 }
5290
5291 // Sink the GEP.
5292 return true;
5293}
5294
5295/// We want to sink following cases:
5296/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
5297/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
5299 if (match(Op, m_VScale()))
5300 return true;
5301 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
5303 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5304 return true;
5305 }
5306 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
5308 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
5309 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
5310 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5311 return true;
5312 }
5313 return false;
5314}
5315
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
bool AArch64TTIImpl::isProfitableToSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  // Handle intrinsic calls by intrinsic ID first; plain instructions are
  // handled by opcode further below.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::aarch64_neon_smull:
    case Intrinsic::aarch64_neon_umull:
      // Sink half-vector extracts feeding a widening multiply so isel can
      // form the "high" variants (smull2/umull2).
      if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
                                   /*AllowSplat=*/true)) {
        Ops.push_back(&II->getOperandUse(0));
        Ops.push_back(&II->getOperandUse(1));
        return true;
      }
      [[fallthrough]];

    case Intrinsic::fma:
    case Intrinsic::fmuladd:
      // Half-precision lane variants require full FP16 support.
      if (isa<VectorType>(I->getType()) &&
          cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
          !ST->hasFullFP16())
        return false;
      [[fallthrough]];
    case Intrinsic::aarch64_neon_sqdmull:
    case Intrinsic::aarch64_neon_sqdmulh:
    case Intrinsic::aarch64_neon_sqrdmulh:
      // Sink splats for index lane variants
      if (isSplatShuffle(II->getOperand(0)))
        Ops.push_back(&II->getOperandUse(0));
      if (isSplatShuffle(II->getOperand(1)))
        Ops.push_back(&II->getOperandUse(1));
      return !Ops.empty();
    case Intrinsic::aarch64_neon_fmlal:
    case Intrinsic::aarch64_neon_fmlal2:
    case Intrinsic::aarch64_neon_fmlsl:
    case Intrinsic::aarch64_neon_fmlsl2:
      // Sink splats for index lane variants
      if (isSplatShuffle(II->getOperand(1)))
        Ops.push_back(&II->getOperandUse(1));
      if (isSplatShuffle(II->getOperand(2)))
        Ops.push_back(&II->getOperandUse(2));
      return !Ops.empty();
    case Intrinsic::aarch64_sve_ptest_first:
    case Intrinsic::aarch64_sve_ptest_last:
      // Sink a ptrue feeding a ptest so isel can fold the two together.
      if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
        if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
          Ops.push_back(&II->getOperandUse(0));
      return !Ops.empty();
    case Intrinsic::aarch64_sme_write_horiz:
    case Intrinsic::aarch64_sme_write_vert:
    case Intrinsic::aarch64_sme_writeq_horiz:
    case Intrinsic::aarch64_sme_writeq_vert: {
      // Sink an add feeding the tile slice index (operand 1).
      auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
      if (!Idx || Idx->getOpcode() != Instruction::Add)
        return false;
      Ops.push_back(&II->getOperandUse(1));
      return true;
    }
    case Intrinsic::aarch64_sme_read_horiz:
    case Intrinsic::aarch64_sme_read_vert:
    case Intrinsic::aarch64_sme_readq_horiz:
    case Intrinsic::aarch64_sme_readq_vert:
    case Intrinsic::aarch64_sme_ld1b_vert:
    case Intrinsic::aarch64_sme_ld1h_vert:
    case Intrinsic::aarch64_sme_ld1w_vert:
    case Intrinsic::aarch64_sme_ld1d_vert:
    case Intrinsic::aarch64_sme_ld1q_vert:
    case Intrinsic::aarch64_sme_st1b_vert:
    case Intrinsic::aarch64_sme_st1h_vert:
    case Intrinsic::aarch64_sme_st1w_vert:
    case Intrinsic::aarch64_sme_st1d_vert:
    case Intrinsic::aarch64_sme_st1q_vert:
    case Intrinsic::aarch64_sme_ld1b_horiz:
    case Intrinsic::aarch64_sme_ld1h_horiz:
    case Intrinsic::aarch64_sme_ld1w_horiz:
    case Intrinsic::aarch64_sme_ld1d_horiz:
    case Intrinsic::aarch64_sme_ld1q_horiz:
    case Intrinsic::aarch64_sme_st1b_horiz:
    case Intrinsic::aarch64_sme_st1h_horiz:
    case Intrinsic::aarch64_sme_st1w_horiz:
    case Intrinsic::aarch64_sme_st1d_horiz:
    case Intrinsic::aarch64_sme_st1q_horiz: {
      // Same as above, but these intrinsics carry the slice index in
      // operand 3.
      auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
      if (!Idx || Idx->getOpcode() != Instruction::Add)
        return false;
      Ops.push_back(&II->getOperandUse(3));
      return true;
    }
    case Intrinsic::aarch64_neon_pmull:
      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
        return false;
      Ops.push_back(&II->getOperandUse(0));
      Ops.push_back(&II->getOperandUse(1));
      return true;
    case Intrinsic::aarch64_neon_pmull64:
      if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
                                     II->getArgOperand(1)))
        return false;
      Ops.push_back(&II->getArgOperandUse(0));
      Ops.push_back(&II->getArgOperandUse(1));
      return true;
    case Intrinsic::masked_gather:
      if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
        return false;
      Ops.push_back(&II->getArgOperandUse(0));
      return true;
    case Intrinsic::masked_scatter:
      if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
        return false;
      Ops.push_back(&II->getArgOperandUse(1));
      return true;
    default:
      return false;
    }
  }

  // A select/branch condition worth sinking: an OR-reduction of a scalable
  // vector, which isel can turn into a predicate test.
  auto ShouldSinkCondition = [](Value *Cond) -> bool {
    auto *II = dyn_cast<IntrinsicInst>(Cond);
    return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
           isa<ScalableVectorType>(II->getOperand(0)->getType());
  };

  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
  case Instruction::Add:
  case Instruction::Sub:
    // Sink vscales closer to uses for better isel
    for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
      if (shouldSinkVScale(I->getOperand(Op), Ops)) {
        Ops.push_back(&I->getOperandUse(Op));
        return true;
      }
    }
    break;
  case Instruction::Select: {
    if (!ShouldSinkCondition(I->getOperand(0)))
      return false;

    Ops.push_back(&I->getOperandUse(0));
    return true;
  }
  case Instruction::Br: {
    if (cast<BranchInst>(I)->isUnconditional())
      return false;

    if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
      return false;

    Ops.push_back(&I->getOperandUse(0));
    return true;
  }
  default:
    break;
  }

  // Everything below applies only to vector instructions.
  if (!I->getType()->isVectorTy())
    return false;

  switch (I->getOpcode()) {
  case Instruction::Sub:
  case Instruction::Add: {
    if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
      return false;

    // If the exts' operands extract either the lower or upper elements, we
    // can sink them too.
    auto Ext1 = cast<Instruction>(I->getOperand(0));
    auto Ext2 = cast<Instruction>(I->getOperand(1));
    if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
      Ops.push_back(&Ext1->getOperandUse(0));
      Ops.push_back(&Ext2->getOperandUse(0));
    }

    Ops.push_back(&I->getOperandUse(0));
    Ops.push_back(&I->getOperandUse(1));

    return true;
  }
  case Instruction::Or: {
    // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
    // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
    if (ST->hasNEON()) {
      Instruction *OtherAnd, *IA, *IB;
      Value *MaskValue;
      // MainAnd refers to And instruction that has 'Not' as one of its operands
      if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
                          m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
                                           m_Instruction(IA)))))) {
        if (match(OtherAnd,
                  m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
          Instruction *MainAnd = I->getOperand(0) == OtherAnd
                                     ? cast<Instruction>(I->getOperand(1))
                                     : cast<Instruction>(I->getOperand(0));

          // Both Ands should be in same basic block as Or
          if (I->getParent() != MainAnd->getParent() ||
              I->getParent() != OtherAnd->getParent())
            return false;

          // Non-mask operands of both Ands should also be in same basic block
          if (I->getParent() != IA->getParent() ||
              I->getParent() != IB->getParent())
            return false;

          Ops.push_back(
              &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
          Ops.push_back(&I->getOperandUse(0));
          Ops.push_back(&I->getOperandUse(1));

          return true;
        }
      }
    }

    return false;
  }
  case Instruction::Mul: {
    auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
      auto *Ty = cast<VectorType>(V->getType());
      // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
      if (Ty->isScalableTy())
        return false;

      // Indexed variants of Mul exist for i16 and i32 element types only.
      return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
    };

    int NumZExts = 0, NumSExts = 0;
    for (auto &Op : I->operands()) {
      // Make sure we are not already sinking this operand
      if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
        continue;

      if (match(&Op, m_ZExtOrSExt(m_Value()))) {
        auto *Ext = cast<Instruction>(Op);
        auto *ExtOp = Ext->getOperand(0);
        if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
          Ops.push_back(&Ext->getOperandUse(0));
        Ops.push_back(&Op);

        if (isa<SExtInst>(Ext))
          NumSExts++;
        else
          NumZExts++;

        continue;
      }

      ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
      if (!Shuffle)
        continue;

      // If the Shuffle is a splat and the operand is a zext/sext, sinking the
      // operand and the s/zext can help create indexed s/umull. This is
      // especially useful to prevent i64 mul being scalarized.
      if (isSplatShuffle(Shuffle) &&
          match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
        Ops.push_back(&Shuffle->getOperandUse(0));
        Ops.push_back(&Op);
        if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
          NumSExts++;
        else
          NumZExts++;
        continue;
      }

      Value *ShuffleOperand = Shuffle->getOperand(0);
      InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
      if (!Insert)
        continue;

      Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
      if (!OperandInstr)
        continue;

      ConstantInt *ElementConstant =
          dyn_cast<ConstantInt>(Insert->getOperand(2));
      // Check that the insertelement is inserting into element 0
      if (!ElementConstant || !ElementConstant->isZero())
        continue;

      unsigned Opcode = OperandInstr->getOpcode();
      if (Opcode == Instruction::SExt)
        NumSExts++;
      else if (Opcode == Instruction::ZExt)
        NumZExts++;
      else {
        // If we find that the top bits are known 0, then we can sink and allow
        // the backend to generate a umull.
        unsigned Bitwidth = I->getType()->getScalarSizeInBits();
        APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
        const DataLayout &DL = I->getDataLayout();
        if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
          continue;
        NumZExts++;
      }

      // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
      // the And, just to hoist it again back to the load.
      if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
        Ops.push_back(&Insert->getOperandUse(1));
      Ops.push_back(&Shuffle->getOperandUse(0));
      Ops.push_back(&Op);
    }

    // It is profitable to sink if we found two of the same type of extends.
    if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
      return true;

    // Otherwise, see if we should sink splats for indexed variants.
    if (!ShouldSinkSplatForIndexedVariant(I))
      return false;

    Ops.clear();
    if (isSplatShuffle(I->getOperand(0)))
      Ops.push_back(&I->getOperandUse(0));
    if (isSplatShuffle(I->getOperand(1)))
      Ops.push_back(&I->getOperandUse(1));

    return !Ops.empty();
  }
  case Instruction::FMul: {
    // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
    if (I->getType()->isScalableTy())
      return false;

    if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
        !ST->hasFullFP16())
      return false;

    // Sink splats for index lane variants
    if (isSplatShuffle(I->getOperand(0)))
      Ops.push_back(&I->getOperandUse(0));
    if (isSplatShuffle(I->getOperand(1)))
      Ops.push_back(&I->getOperandUse(1));
    return !Ops.empty();
  }
  default:
    return false;
  }
  return false;
}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II, int PredPos)
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static std::optional< Instruction * > instCombineSVEAllActive(IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVENoActiveZero(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use of the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II, bool hasInactiveVector)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static bool hasPossibleIncompatibleOps(const Function *F)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file provides a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
uint64_t Size
Hexagon Common GEP
This file provides the interface for the instcombine pass implementation.
static LVOptions Options
Definition: LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
uint64_t IntrinsicInst * II
#define P(N)
if(PassOpts->AAPipeline)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
unsigned getEpilogueVectorizationMinVF() const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool shouldTreatInstructionLikeSelect(const Instruction *I)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp) const
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
unsigned getEpilogueVectorizationMinVF() const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
uint64_t getFeatureMask(const Function &F) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isMultiversionedFunction(const Function &F) const
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
bool isLegalMaskedGatherScatter(Type *DataType) const
unsigned getMaxInterleaveFactor(ElementCount VF)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
unsigned countLeadingOnes() const
Definition: APInt.h:1603
void negate()
Negate this APInt in place.
Definition: APInt.h:1450
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
unsigned logBase2() const
Definition: APInt.h:1739
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:622
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: BasicTTIImpl.h:306
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:694
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:922
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:806
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:958
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:379
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition: InstrTypes.h:218
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isIntPredicate() const
Definition: InstrTypes.h:781
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1672
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:208
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition: IRBuilder.h:92
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool allowContract() const
Definition: FMF.h:70
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Definition: IRBuilder.cpp:89
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2511
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2562
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1080
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2499
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:578
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1163
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:546
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1053
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:545
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition: IRBuilder.h:563
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:550
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2234
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1677
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1798
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2533
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1811
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:566
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition: IRBuilder.h:573
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2225
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:48
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:388
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:412
BuilderTy & Builder
Definition: InstCombiner.h:61
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:310
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:694
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:60
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:77
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool requiresSMChange(const SMEAttrs &Callee) const
void set(unsigned M, bool Enable=true)
bool hasStreamingBody() const
bool isNewZA() const
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:812
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
Definition: DerivedTypes.h:660
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
This instruction constructs a fixed permutation of two input vectors.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition: SmallPtrSet.h:94
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
An instruction for storing to memory.
Definition: Instructions.h:292
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:68
R Default(T Value)
Definition: StringSwitch.h:177
Class to represent struct types.
Definition: DerivedTypes.h:218
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
bool shouldTreatInstructionLikeSelect(const Instruction *I)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:946
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:674
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
int getNumOccurrences() const
Definition: CommandLine.h:399
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:174
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
const ParentTy * getParent() const
Definition: ilist_node.h:32
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
uint64_t getFMVPriority(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:756
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:560
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:592
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:931
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:463
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
Definition: LoopInfo.cpp:1077
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:297
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition: VE.h:376
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr int PoisonMaskElem
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ Mod
The access may modify the value stored in memory.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FAnyOf
Any_of reduction with select(fcmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ IAnyOf
Any_of reduction with select(icmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition: STLExtras.h:2087
InstructionCost Cost
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:383
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:289
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Type Conversion Cost Table.
Definition: CostTable.h:55