ARMTargetTransformInfo.cpp
1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
60 "widen-global-strings", cl::Hidden, cl::init(true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
64
66
68
69/// Convert a vector load intrinsic into a simple llvm load instruction.
70/// This is beneficial when the underlying object being addressed comes
71/// from a constant, since we get constant-folding for free.
72static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
73 InstCombiner::BuilderTy &Builder) {
74 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
75
76 if (!IntrAlign)
77 return nullptr;
78
79 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
80 ? MemAlign
81 : IntrAlign->getLimitedValue();
82
83 if (!isPowerOf2_32(Alignment))
84 return nullptr;
85
86 return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
87 Align(Alignment));
88}
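// For example (illustrative IR; the exact intrinsic mangling may differ): a call
//   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32(ptr %p, i32 8)
// where %p is known to be 16-byte aligned is rewritten to
//   %v = load <4 x i32>, ptr %p, align 16
// so later passes can constant-fold the load when %p addresses a constant global.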
89
90bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
 91 const Function *Callee) const {
92 const TargetMachine &TM = getTLI()->getTargetMachine();
93 const FeatureBitset &CallerBits =
94 TM.getSubtargetImpl(*Caller)->getFeatureBits();
95 const FeatureBitset &CalleeBits =
96 TM.getSubtargetImpl(*Callee)->getFeatureBits();
97
98 // To inline a callee, all features not in the allowed list must match exactly.
99 bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
100 (CalleeBits & ~InlineFeaturesAllowed);
101 // For features in the allowed list, the callee's features must be a subset of
102 // the callers'.
103 bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
104 (CalleeBits & InlineFeaturesAllowed);
105 return MatchExact && MatchSubset;
106}
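// A minimal sketch of the matching rule above, using plain bitmasks instead of
// FeatureBitset (the names here are illustrative, not from this file):
//   uint64_t Allowed = DSP | MVE;            // stands in for InlineFeaturesAllowed
//   bool Exact  = (Caller & ~Allowed) == (Callee & ~Allowed);
//   bool Subset = ((Caller & Callee) & Allowed) == (Callee & Allowed);
//   // Inlining is permitted only if Exact && Subset: the callee may drop, but
//   // never add, features from the allowed list.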
107
108TTI::AddressingModeKind
109ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
 110 ScalarEvolution *SE) const {
111 if (ST->hasMVEIntegerOps())
 112 return TTI::AMK_PostIndexed;
 113
114 if (L->getHeader()->getParent()->hasOptSize())
115 return TTI::AMK_None;
116
117 if (ST->isMClass() && ST->isThumb2() &&
118 L->getNumBlocks() == 1)
119 return TTI::AMK_PreIndexed;
120
121 return TTI::AMK_None;
122}
123
124std::optional<Instruction *>
125ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
126 using namespace PatternMatch;
127 Intrinsic::ID IID = II.getIntrinsicID();
128 switch (IID) {
129 default:
130 break;
131 case Intrinsic::arm_neon_vld1: {
132 Align MemAlign =
 133 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
 134 &IC.getAssumptionCache(), &IC.getDominatorTree());
135 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
136 return IC.replaceInstUsesWith(II, V);
137 }
138 break;
139 }
140
141 case Intrinsic::arm_neon_vld2:
142 case Intrinsic::arm_neon_vld3:
143 case Intrinsic::arm_neon_vld4:
144 case Intrinsic::arm_neon_vld2lane:
145 case Intrinsic::arm_neon_vld3lane:
146 case Intrinsic::arm_neon_vld4lane:
147 case Intrinsic::arm_neon_vst1:
148 case Intrinsic::arm_neon_vst2:
149 case Intrinsic::arm_neon_vst3:
150 case Intrinsic::arm_neon_vst4:
151 case Intrinsic::arm_neon_vst2lane:
152 case Intrinsic::arm_neon_vst3lane:
153 case Intrinsic::arm_neon_vst4lane: {
154 Align MemAlign =
 155 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
 156 &IC.getAssumptionCache(), &IC.getDominatorTree());
157 unsigned AlignArg = II.arg_size() - 1;
158 Value *AlignArgOp = II.getArgOperand(AlignArg);
159 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
160 if (Align && *Align < MemAlign) {
161 return IC.replaceOperand(
162 II, AlignArg,
163 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
164 false));
165 }
166 break;
167 }
168
169 case Intrinsic::arm_neon_vld1x2:
170 case Intrinsic::arm_neon_vld1x3:
171 case Intrinsic::arm_neon_vld1x4:
172 case Intrinsic::arm_neon_vst1x2:
173 case Intrinsic::arm_neon_vst1x3:
174 case Intrinsic::arm_neon_vst1x4: {
175 Align NewAlign =
 176 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
 177 &IC.getAssumptionCache(), &IC.getDominatorTree());
178 Align OldAlign = II.getParamAlign(0).valueOrOne();
179 if (NewAlign > OldAlign)
180 II.addParamAttr(0,
181 Attribute::getWithAlignment(II.getContext(), NewAlign));
182 break;
183 }
184
185 case Intrinsic::arm_mve_pred_i2v: {
186 Value *Arg = II.getArgOperand(0);
187 Value *ArgArg;
188 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
189 PatternMatch::m_Value(ArgArg))) &&
190 II.getType() == ArgArg->getType()) {
191 return IC.replaceInstUsesWith(II, ArgArg);
192 }
193 Constant *XorMask;
194 if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
195 PatternMatch::m_Value(ArgArg)),
196 PatternMatch::m_Constant(XorMask))) &&
197 II.getType() == ArgArg->getType()) {
198 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
199 if (CI->getValue().trunc(16).isAllOnes()) {
200 auto TrueVector = IC.Builder.CreateVectorSplat(
201 cast<FixedVectorType>(II.getType())->getNumElements(),
202 IC.Builder.getTrue());
203 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
204 }
205 }
206 }
207 KnownBits ScalarKnown(32);
208 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
209 ScalarKnown)) {
210 return &II;
211 }
212 break;
213 }
214 case Intrinsic::arm_mve_pred_v2i: {
215 Value *Arg = II.getArgOperand(0);
216 Value *ArgArg;
217 if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
218 PatternMatch::m_Value(ArgArg)))) {
219 return IC.replaceInstUsesWith(II, ArgArg);
220 }
221
222 if (II.getMetadata(LLVMContext::MD_range))
223 break;
224
225 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
226
227 if (auto CurrentRange = II.getRange()) {
228 Range = Range.intersectWith(*CurrentRange);
229 if (Range == CurrentRange)
230 break;
231 }
232
233 II.addRangeRetAttr(Range);
234 II.addRetAttr(Attribute::NoUndef);
235 return &II;
236 }
237 case Intrinsic::arm_mve_vadc:
238 case Intrinsic::arm_mve_vadc_predicated: {
239 unsigned CarryOp =
240 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
241 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
242 "Bad type for intrinsic!");
243
244 KnownBits CarryKnown(32);
245 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
246 CarryKnown)) {
247 return &II;
248 }
249 break;
250 }
251 case Intrinsic::arm_mve_vmldava: {
252 Instruction *I = cast<Instruction>(&II);
253 if (I->hasOneUse()) {
254 auto *User = cast<Instruction>(*I->user_begin());
255 Value *OpZ;
256 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
257 match(I->getOperand(3), m_Zero())) {
258 Value *OpX = I->getOperand(4);
259 Value *OpY = I->getOperand(5);
260 Type *OpTy = OpX->getType();
261
 262 IC.Builder.SetInsertPoint(User);
 263 Value *V =
264 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
265 {I->getOperand(0), I->getOperand(1),
266 I->getOperand(2), OpZ, OpX, OpY});
267
 268 IC.replaceInstUsesWith(*User, V);
 269 return IC.eraseInstFromFunction(*User);
270 }
271 }
272 return std::nullopt;
273 }
274 }
275 return std::nullopt;
276}
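// Example of the predicate round-trip folds above (illustrative IR):
//   %i = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %p)
//   %q = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %i)
// simplifies %q to plain %p, and xor-ing %i with 0xffff before the i2v becomes
// a vector 'xor %p, <true, true, true, true>' rather than scalar predicate math.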
277
278std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
 279 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
280 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
281 std::function<void(Instruction *, unsigned, APInt, APInt &)>
282 SimplifyAndSetOp) const {
283
284 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
285 // opcode specifying a Top/Bottom instruction, which can change between
286 // instructions.
 287 auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
288 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
289 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
290
 291 // Only the odd or even lanes of operand 0 are demanded, depending on
 292 // whether this is a top or bottom instruction.
293 APInt DemandedElts =
294 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
295 : APInt::getHighBitsSet(2, 1));
296 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
297 // The other lanes will be defined from the inserted elements.
298 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
299 : APInt::getHighBitsSet(2, 1));
300 return std::nullopt;
301 };
302
303 switch (II.getIntrinsicID()) {
304 default:
305 break;
306 case Intrinsic::arm_mve_vcvt_narrow:
307 SimplifyNarrowInstrTopBottom(2);
308 break;
309 case Intrinsic::arm_mve_vqmovn:
310 SimplifyNarrowInstrTopBottom(4);
311 break;
312 case Intrinsic::arm_mve_vshrn:
313 SimplifyNarrowInstrTopBottom(7);
314 break;
315 }
316
317 return std::nullopt;
318}
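// Worked example (assuming a v8i16 result): for a bottom-form narrowing op
// (IsTop == 0) only the odd lanes of operand 0 pass through to the result, so
// DemandedElts is a splat of 0b10 over 8 lanes, i.e. 0b10101010, and whatever
// feeds operand 0 need not compute the even lanes; the top form is the mirror
// image with the even lanes demanded.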
319
320InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
 321 TTI::TargetCostKind CostKind) const {
 322 assert(Ty->isIntegerTy());
323
324 unsigned Bits = Ty->getPrimitiveSizeInBits();
325 if (Bits == 0 || Imm.getActiveBits() >= 64)
326 return 4;
327
328 int64_t SImmVal = Imm.getSExtValue();
329 uint64_t ZImmVal = Imm.getZExtValue();
330 if (!ST->isThumb()) {
331 if ((SImmVal >= 0 && SImmVal < 65536) ||
332 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
333 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
334 return 1;
335 return ST->hasV6T2Ops() ? 2 : 3;
336 }
337 if (ST->isThumb2()) {
338 if ((SImmVal >= 0 && SImmVal < 65536) ||
339 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
340 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
341 return 1;
342 return ST->hasV6T2Ops() ? 2 : 3;
343 }
 344 // Thumb1: any i8 immediate costs 1.
345 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
346 return 1;
347 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
348 return 2;
349 // Load from constantpool.
350 return 3;
351}
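// Rough examples of the buckets above: on Thumb2, #255 and #0x00ff00ff are
// directly encodable (cost 1); an arbitrary constant such as 0x12345678 needs a
// MOVW/MOVT pair (cost 2); on Thumb1 the same constant comes from a constant
// pool load (cost 3).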
352
353// Constants smaller than 256 fit in the immediate field of
354// Thumb1 instructions, so we return a cost of zero for them and 1 otherwise.
355InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
 356 const APInt &Imm,
 357 Type *Ty) const {
358 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
359 return 0;
360
361 return 1;
362}
363
364// Checks whether Inst is part of a min(max()) or max(min()) pattern
365// that will match to an SSAT instruction. Returns the instruction being
366// saturated, or null if no saturation pattern was found.
367static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
368 Value *LHS, *RHS;
369 ConstantInt *C;
 370 SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor;
 371
 372 if (InstSPF == SPF_SMAX &&
 373 PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
 374 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
375
376 auto isSSatMin = [&](Value *MinInst) {
377 if (isa<SelectInst>(MinInst)) {
378 Value *MinLHS, *MinRHS;
379 ConstantInt *MinC;
380 SelectPatternFlavor MinSPF =
381 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
382 if (MinSPF == SPF_SMIN &&
 383 PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) &&
 384 MinC->getValue() == ((-Imm) - 1))
385 return true;
386 }
387 return false;
388 };
389
390 if (isSSatMin(Inst->getOperand(1)))
391 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
392 if (Inst->hasNUses(2) &&
393 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
394 return Inst->getOperand(1);
395 }
396 return nullptr;
397}
398
399// Look for a FP Saturation pattern, where the instruction can be simplified to
400// a fptosi.sat. max(min(fptosi)). The constant in this case is always free.
401static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
402 if (Imm.getBitWidth() != 64 ||
403 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
404 return false;
405 Value *FP = isSSATMinMaxPattern(Inst, Imm);
406 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
407 FP = isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm);
408 if (!FP)
409 return false;
410 return isa<FPToSIInst>(FP);
411}
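// The FP pattern above corresponds to IR of roughly this shape (illustrative):
//   %c = fptosi float %x to i64
//   %lo = smin(%c, 2147483647)
//   %r  = smax(%lo, -2147483648)
// which codegen can turn into a single saturating convert (an fptosi.sat), so
// the large clamp constants are treated as free rather than hoisted.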
412
413InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
 414 const APInt &Imm, Type *Ty,
 415 TTI::TargetCostKind CostKind,
 416 Instruction *Inst) const {
417 // Division by a constant can be turned into multiplication, but only if we
418 // know it's constant. So it's not so much that the immediate is cheap (it's
419 // not), but that the alternative is worse.
420 // FIXME: this is probably unneeded with GlobalISel.
421 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
422 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
423 Idx == 1)
424 return 0;
425
426 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
427 // splitting any large offsets.
428 if (Opcode == Instruction::GetElementPtr && Idx != 0)
429 return 0;
430
431 if (Opcode == Instruction::And) {
432 // UXTB/UXTH
433 if (Imm == 255 || Imm == 65535)
434 return 0;
435 // Conversion to BIC is free, and means we can use ~Imm instead.
436 return std::min(getIntImmCost(Imm, Ty, CostKind),
437 getIntImmCost(~Imm, Ty, CostKind));
438 }
439
440 if (Opcode == Instruction::Add)
441 // Conversion to SUB is free, and means we can use -Imm instead.
442 return std::min(getIntImmCost(Imm, Ty, CostKind),
443 getIntImmCost(-Imm, Ty, CostKind));
444
445 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
446 Ty->getIntegerBitWidth() == 32) {
447 int64_t NegImm = -Imm.getSExtValue();
448 if (ST->isThumb2() && NegImm < 1<<12)
449 // icmp X, #-C -> cmn X, #C
450 return 0;
451 if (ST->isThumb() && NegImm < 1<<8)
452 // icmp X, #-C -> adds X, #C
453 return 0;
454 }
455
456 // xor a, -1 can always be folded to MVN
457 if (Opcode == Instruction::Xor && Imm.isAllOnes())
458 return 0;
459
460 // Ensures negative constant of min(max()) or max(min()) patterns that
461 // match to SSAT instructions don't get hoisted
462 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
463 Ty->getIntegerBitWidth() <= 32) {
464 if (isSSATMinMaxPattern(Inst, Imm) ||
465 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
466 isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm)))
467 return 0;
468 }
469
470 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
471 return 0;
472
473 // We can convert <= -1 to < 0, which is generally quite cheap.
474 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
475 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
476 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
477 return std::min(getIntImmCost(Imm, Ty, CostKind),
478 getIntImmCost(Imm + 1, Ty, CostKind));
479 }
480
481 return getIntImmCost(Imm, Ty, CostKind);
482}
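// Examples of the zero-cost cases above: 'and x, 255' maps to UXTB, 'xor x, -1'
// folds to MVN, and on Thumb2 'icmp slt x, -7' becomes CMN x, #7, so none of
// these immediates is worth hoisting into a register.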
483
484InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
 485 TTI::TargetCostKind CostKind,
 486 const Instruction *I) const {
 487 if (CostKind == TTI::TCK_RecipThroughput &&
 488 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
 489 // FIXME: The vectorizer is highly sensitive to the cost of these
490 // instructions, which suggests that it may be using the costs incorrectly.
491 // But, for now, just make them free to avoid performance regressions for
492 // vector targets.
493 return 0;
494 }
495 return BaseT::getCFInstrCost(Opcode, CostKind, I);
496}
497
498InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
 499 Type *Src,
 500 TTI::CastContextHint CCH,
 501 TTI::TargetCostKind CostKind,
 502 const Instruction *I) const {
503 int ISD = TLI->InstructionOpcodeToISD(Opcode);
504 assert(ISD && "Invalid opcode");
505
506 // TODO: Allow non-throughput costs that aren't binary.
507 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
 508 if (CostKind != TTI::TCK_RecipThroughput)
 509 return Cost == 0 ? 0 : 1;
510 return Cost;
511 };
512 auto IsLegalFPType = [this](EVT VT) {
513 EVT EltVT = VT.getScalarType();
514 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
515 (EltVT == MVT::f64 && ST->hasFP64()) ||
516 (EltVT == MVT::f16 && ST->hasFullFP16());
517 };
518
519 EVT SrcTy = TLI->getValueType(DL, Src);
520 EVT DstTy = TLI->getValueType(DL, Dst);
521
522 if (!SrcTy.isSimple() || !DstTy.isSimple())
523 return AdjustCost(
524 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
525
526 // Extending masked load/Truncating masked stores is expensive because we
527 // currently don't split them. This means that we'll likely end up
528 // loading/storing each element individually (hence the high cost).
529 if ((ST->hasMVEIntegerOps() &&
530 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
531 Opcode == Instruction::SExt)) ||
532 (ST->hasMVEFloatOps() &&
533 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
534 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
535 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
536 return 2 * DstTy.getVectorNumElements() *
 537 ST->getMVEVectorCostFactor(CostKind);
 538
539 // The extend of other kinds of load is free
540 if (CCH == TTI::CastContextHint::Normal ||
 541 CCH == TTI::CastContextHint::Masked) {
 542 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
543 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
544 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
545 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
546 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
547 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
548 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
549 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
550 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
551 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
552 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
553 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
554 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
555 };
556 if (const auto *Entry = ConvertCostTableLookup(
557 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
558 return AdjustCost(Entry->Cost);
559
560 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
561 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
562 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
563 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
564 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
565 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
566 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
 567 // The following extend from a legal type to an illegal type, so they need
 568 // to split the load. This introduces an extra load operation, but the
569 // extend is still "free".
570 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
571 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
572 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
573 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
574 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
575 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
576 };
577 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
578 if (const auto *Entry =
579 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
580 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
581 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
582 }
583
584 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
585 // FPExtends are similar but also require the VCVT instructions.
586 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
587 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
588 };
589 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
590 if (const auto *Entry =
591 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
592 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
593 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
594 }
595
596 // The truncate of a store is free. This is the mirror of extends above.
597 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
598 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
599 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
600 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
601 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
602 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
603 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
604 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
605 };
606 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
607 if (const auto *Entry =
608 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
609 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
610 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
611 }
612
613 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
614 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
615 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
616 };
617 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
618 if (const auto *Entry =
619 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
620 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
621 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
622 }
623 }
624
625 // NEON vector operations that can extend their inputs.
626 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
627 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
628 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
629 // vaddl
630 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
631 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
632 // vsubl
633 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
634 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
635 // vmull
636 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
637 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
638 // vshll
639 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
640 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
641 };
642
643 auto *User = cast<Instruction>(*I->user_begin());
644 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
645 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
646 DstTy.getSimpleVT(),
647 SrcTy.getSimpleVT())) {
648 return AdjustCost(Entry->Cost);
649 }
650 }
651
652 // Single to/from double precision conversions.
653 if (Src->isVectorTy() && ST->hasNEON() &&
654 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
655 DstTy.getScalarType() == MVT::f32) ||
656 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
657 DstTy.getScalarType() == MVT::f64))) {
658 static const CostTblEntry NEONFltDblTbl[] = {
659 // Vector fptrunc/fpext conversions.
660 {ISD::FP_ROUND, MVT::v2f64, 2},
661 {ISD::FP_EXTEND, MVT::v2f32, 2},
662 {ISD::FP_EXTEND, MVT::v4f32, 4}};
663
664 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
665 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
666 return AdjustCost(LT.first * Entry->Cost);
667 }
668
669 // Some arithmetic, load and store operations have specific instructions
670 // to cast up/down their types automatically at no extra cost.
671 // TODO: Get these tables to know at least what the related operations are.
672 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
673 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
674 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
675 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
676 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
677 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
678 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
679
680 // The number of vmovl instructions for the extension.
681 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
682 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
683 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
684 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
685 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
686 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
687 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
688 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
689 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
690 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
691 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
692 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
693 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
694 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
695 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
696 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
697 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
698 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
699
700 // Operations that we legalize using splitting.
701 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
702 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
703
704 // Vector float <-> i32 conversions.
705 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
706 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
707
708 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
709 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
710 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
711 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
712 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
713 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
714 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
715 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
716 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
717 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
718 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
719 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
720 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
721 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
722 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
723 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
724 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
725 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
726 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
727 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
728
729 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
730 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
731 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
732 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
733 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
734 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
735
736 // Vector double <-> i32 conversions.
737 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
738 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
739
740 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
741 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
742 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
743 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
744 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
745 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
746
747 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
748 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
749 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
750 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
751 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
752 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
753 };
754
755 if (SrcTy.isVector() && ST->hasNEON()) {
756 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
757 DstTy.getSimpleVT(),
758 SrcTy.getSimpleVT()))
759 return AdjustCost(Entry->Cost);
760 }
761
762 // Scalar float to integer conversions.
763 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
764 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
765 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
766 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
767 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
768 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
769 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
770 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
771 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
772 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
773 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
774 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
775 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
776 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
777 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
778 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
779 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
780 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
781 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
782 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
783 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
784 };
785 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
786 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
787 DstTy.getSimpleVT(),
788 SrcTy.getSimpleVT()))
789 return AdjustCost(Entry->Cost);
790 }
791
792 // Scalar integer to float conversions.
793 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
794 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
795 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
796 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
797 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
798 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
799 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
800 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
801 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
802 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
803 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
804 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
805 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
806 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
807 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
808 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
809 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
810 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
811 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
812 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
813 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
814 };
815
816 if (SrcTy.isInteger() && ST->hasNEON()) {
817 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
818 ISD, DstTy.getSimpleVT(),
819 SrcTy.getSimpleVT()))
820 return AdjustCost(Entry->Cost);
821 }
822
823 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
 824 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts
 825 // are linearised so take more.
826 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
827 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
828 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
829 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
830 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
831 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
832 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
833 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
834 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
835 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
836 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
837 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
838 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
839 };
840
841 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
842 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
843 ISD, DstTy.getSimpleVT(),
844 SrcTy.getSimpleVT()))
845 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
846 }
847
848 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
 849 // As a general rule, fp converts that were not matched above are scalarized
 850 // and cost 1 vcvt for each lane, so long as the instruction is available.
 851 // If not, they become a series of function calls.
852 const InstructionCost CallCost =
853 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
854 int Lanes = 1;
855 if (SrcTy.isFixedLengthVector())
856 Lanes = SrcTy.getVectorNumElements();
857
858 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
859 return Lanes;
860 else
861 return Lanes * CallCost;
862 }
863
864 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
865 SrcTy.isFixedLengthVector()) {
866 // Treat a truncate with larger than legal source (128bits for MVE) as
867 // expensive, 2 instructions per lane.
868 if ((SrcTy.getScalarType() == MVT::i8 ||
869 SrcTy.getScalarType() == MVT::i16 ||
870 SrcTy.getScalarType() == MVT::i32) &&
871 SrcTy.getSizeInBits() > 128 &&
872 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
873 return SrcTy.getVectorNumElements() * 2;
874 }
875
876 // Scalar integer conversion costs.
877 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
878 // i16 -> i64 requires two dependent operations.
879 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
880
881 // Truncates on i64 are assumed to be free.
882 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
883 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
884 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
885 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
886 };
887
888 if (SrcTy.isInteger()) {
889 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
890 DstTy.getSimpleVT(),
891 SrcTy.getSimpleVT()))
892 return AdjustCost(Entry->Cost);
893 }
894
895 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
 896 ? ST->getMVEVectorCostFactor(CostKind)
 897 : 1;
898 return AdjustCost(
899 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
900}
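// Two examples of how the tables above combine: a 'sext i16 -> i32' fed by a
// load is free because it folds into LDRSH, while an MVE zext of a loaded
// <16 x i8> to <16 x i32> is looked up as cost 3 and then scaled by
// ST->getMVEVectorCostFactor(CostKind) to account for multi-beat execution.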
901
902InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
 903 TTI::TargetCostKind CostKind,
 904 unsigned Index, const Value *Op0,
905 const Value *Op1) const {
 906 // Penalize inserting into a D-subregister. We end up with a three times
 907 // lower estimated throughput on Swift.
908 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
909 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
910 return 3;
911
912 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
913 Opcode == Instruction::ExtractElement)) {
914 // Cross-class copies are expensive on many microarchitectures,
915 // so assume they are expensive by default.
916 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
917 return 3;
918
919 // Even if it's not a cross class copy, this likely leads to mixing
920 // of NEON and VFP code and should be therefore penalized.
921 if (ValTy->isVectorTy() &&
922 ValTy->getScalarSizeInBits() <= 32)
923 return std::max<InstructionCost>(
924 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
925 2U);
926 }
927
928 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
929 Opcode == Instruction::ExtractElement)) {
930 // Integer cross-lane moves are more expensive than float, which can
 931 // sometimes just be vmovs. Integer moves involve a pass through GPR registers,
932 // causing more of a delay.
933 std::pair<InstructionCost, MVT> LT =
 934 getTypeLegalizationCost(ValTy->getScalarType());
 935 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
936 }
937
938 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
939}
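// For instance, under MVE an 'extractelement <4 x i32>' is costed at roughly 4
// because the value must round-trip through a GPR, whereas extracting a float
// lane can often be a plain VMOV and keeps the base cost of 1.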
940
941InstructionCost ARMTTIImpl::getCmpSelInstrCost(
 942 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
 943 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
 944 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
945 int ISD = TLI->InstructionOpcodeToISD(Opcode);
946
947 // Thumb scalar code size cost for select.
948 if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT &&
949 ST->isThumb() && !ValTy->isVectorTy()) {
950 // Assume expensive structs.
951 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
952 return TTI::TCC_Expensive;
953
954 // Select costs can vary because they:
955 // - may require one or more conditional mov (including an IT),
956 // - can't operate directly on immediates,
957 // - require live flags, which we can't copy around easily.
 958 InstructionCost Cost = TTI::TCC_Basic;
 959
960 // Possible IT instruction for Thumb2, or more for Thumb1.
961 ++Cost;
962
963 // i1 values may need rematerialising by using mov immediates and/or
964 // flag setting instructions.
965 if (ValTy->isIntegerTy(1))
966 ++Cost;
967
968 return Cost;
969 }
970
971 // If this is a vector min/max/abs, use the cost of that intrinsic directly
972 // instead. Hopefully when min/max intrinsics are more prevalent this code
973 // will not be needed.
974 const Instruction *Sel = I;
975 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
976 Sel->hasOneUse())
977 Sel = cast<Instruction>(Sel->user_back());
978 if (Sel && ValTy->isVectorTy() &&
979 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
980 const Value *LHS, *RHS;
 981 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
 982 unsigned IID = 0;
983 switch (SPF) {
984 case SPF_ABS:
985 IID = Intrinsic::abs;
986 break;
987 case SPF_SMIN:
988 IID = Intrinsic::smin;
989 break;
990 case SPF_SMAX:
991 IID = Intrinsic::smax;
992 break;
993 case SPF_UMIN:
994 IID = Intrinsic::umin;
995 break;
996 case SPF_UMAX:
997 IID = Intrinsic::umax;
998 break;
999 case SPF_FMINNUM:
1000 IID = Intrinsic::minnum;
1001 break;
1002 case SPF_FMAXNUM:
1003 IID = Intrinsic::maxnum;
1004 break;
1005 default:
1006 break;
1007 }
1008 if (IID) {
1009 // The ICmp is free, the select gets the cost of the min/max/etc
1010 if (Sel != I)
1011 return 0;
1012 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1013 return getIntrinsicInstrCost(CostAttrs, CostKind);
1014 }
1015 }
1016
1017 // On NEON a vector select gets lowered to vbsl.
1018 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1019 // Lowering of some vector selects is currently far from perfect.
1020 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1021 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1022 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1023 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1024 };
1025
1026 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1027 EVT SelValTy = TLI->getValueType(DL, ValTy);
1028 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1029 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1030 SelCondTy.getSimpleVT(),
1031 SelValTy.getSimpleVT()))
1032 return Entry->Cost;
1033 }
1034
1035 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1036 return LT.first;
1037 }
1038
1039 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1040 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1041 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1042 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1043 FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy);
1044 if (!VecCondTy)
1045 VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy));
1046
1047 // If we don't have mve.fp any fp operations will need to be scalarized.
1048 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
 1049 // One scalarization insert, one scalarization extract and the cost of the
1050 // fcmps.
1051 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1052 /*Extract*/ true, CostKind) +
1053 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1054 /*Extract*/ false, CostKind) +
1055 VecValTy->getNumElements() *
1056 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1057 VecCondTy->getScalarType(), VecPred,
1058 CostKind, Op1Info, Op2Info, I);
1059 }
1060
1061 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1062 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1063 // There are two types - the input that specifies the type of the compare
1064 // and the output vXi1 type. Because we don't know how the output will be
1065 // split, we may need an expensive shuffle to get two in sync. This has the
1066 // effect of making larger than legal compares (v8i32 for example)
1067 // expensive.
1068 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1069 if (LT.first > 1)
1070 return LT.first * BaseCost +
1071 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1072 /*Extract*/ false, CostKind);
1073 return BaseCost;
1074 }
1075 }
1076
1077 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1078 // for "multiple beats" potentially needed by MVE instructions.
1079 int BaseCost = 1;
1080 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1081 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1082
1083 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1084 CostKind, Op1Info, Op2Info, I);
1085}
1086
1087InstructionCost ARMTTIImpl::getAddressComputationCost(Type *PtrTy,
 1088 ScalarEvolution *SE,
 1089 const SCEV *Ptr,
 1090 TTI::TargetCostKind CostKind) const {
1091 // Address computations in vectorized code with non-consecutive addresses will
1092 // likely result in more instructions compared to scalar code where the
1093 // computation can more often be merged into the index mode. The resulting
1094 // extra micro-ops can significantly decrease throughput.
1095 unsigned NumVectorInstToHideOverhead = 10;
1096 int MaxMergeDistance = 64;
1097
1098 if (ST->hasNEON()) {
1099 if (PtrTy->isVectorTy() && SE &&
1100 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1101 return NumVectorInstToHideOverhead;
1102
1103 // In many cases the address computation is not merged into the instruction
1104 // addressing mode.
1105 return 1;
1106 }
1107 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1108}
1109
1110bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) const {
 1111 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1112 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1113 // optimized, else LSR may block tail-predication.
1114 switch (II->getIntrinsicID()) {
1115 case Intrinsic::arm_mve_vctp8:
1116 case Intrinsic::arm_mve_vctp16:
1117 case Intrinsic::arm_mve_vctp32:
1118 case Intrinsic::arm_mve_vctp64:
1119 return true;
1120 default:
1121 break;
1122 }
1123 }
1124 return false;
1125}
1126
1127bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
 1128 unsigned /*AddressSpace*/) const {
1129 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1130 return false;
1131
1132 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1133 // Don't support v2i1 yet.
1134 if (VecTy->getNumElements() == 2)
1135 return false;
1136
1137 // We don't support extending fp types.
1138 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1139 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1140 return false;
1141 }
1142
1143 unsigned EltWidth = DataTy->getScalarSizeInBits();
1144 return (EltWidth == 32 && Alignment >= 4) ||
1145 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1146}
1147
1148bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
1149 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1150 return false;
1151
1152 unsigned EltWidth = Ty->getScalarSizeInBits();
1153 return ((EltWidth == 32 && Alignment >= 4) ||
1154 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1155}
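// Concretely: a gather of <4 x i32> needs the elements 4-byte aligned and a
// gather of <8 x i16> needs 2-byte alignment, while i8 gathers are always
// accepted; anything else is reported as not legal and is left to scalarised
// code.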
1156
1157/// Given a memcpy/memset/memmove instruction, return the number of memory
1158/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1159/// call is used.
1160int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const {
 1161 MemOp MOp;
1162 unsigned DstAddrSpace = ~0u;
1163 unsigned SrcAddrSpace = ~0u;
1164 const Function *F = I->getParent()->getParent();
1165
1166 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1167 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1168 // If 'size' is not a constant, a library call will be generated.
1169 if (!C)
1170 return -1;
1171
1172 const unsigned Size = C->getValue().getZExtValue();
1173 const Align DstAlign = MC->getDestAlign().valueOrOne();
1174 const Align SrcAlign = MC->getSourceAlign().valueOrOne();
1175
1176 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1177 /*IsVolatile*/ false);
1178 DstAddrSpace = MC->getDestAddressSpace();
1179 SrcAddrSpace = MC->getSourceAddressSpace();
1180 }
1181 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1182 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1183 // If 'size' is not a constant, a library call will be generated.
1184 if (!C)
1185 return -1;
1186
1187 const unsigned Size = C->getValue().getZExtValue();
1188 const Align DstAlign = MS->getDestAlign().valueOrOne();
1189
1190 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1191 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1192 DstAddrSpace = MS->getDestAddressSpace();
1193 }
1194 else
1195 llvm_unreachable("Expected a memcpy/move or memset!");
1196
1197 unsigned Limit, Factor = 2;
1198 switch(I->getIntrinsicID()) {
1199 case Intrinsic::memcpy:
1200 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1201 break;
1202 case Intrinsic::memmove:
1203 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1204 break;
1205 case Intrinsic::memset:
1206 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1207 Factor = 1;
1208 break;
1209 default:
1210 llvm_unreachable("Expected a memcpy/move or memset!");
1211 }
1212
 1213 // MemOps will be populated with a list of data types that need to be
1214 // loaded and stored. That's why we multiply the number of elements by 2 to
1215 // get the cost for this memcpy.
1216 std::vector<EVT> MemOps;
1217 LLVMContext &C = F->getContext();
1218 if (getTLI()->findOptimalMemOpLowering(C, MemOps, Limit, MOp, DstAddrSpace,
1219 SrcAddrSpace, F->getAttributes()))
1220 return MemOps.size() * Factor;
1221
1222 // If we can't find an optimal memop lowering, return the default cost
1223 return -1;
1224}
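// Example (assuming no wider types are chosen): a 16-byte memcpy with
// word-aligned operands is typically lowered by findOptimalMemOpLowering to
// four i32 memory ops, giving 4 * Factor(2) = 8 load/store operations, whereas
// a memset only issues the stores (Factor = 1).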
1225
1226InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
 1227 int NumOps = getNumMemOps(cast<IntrinsicInst>(I));
1228
1229 // To model the cost of a library call, we assume 1 for the call, and
1230 // 3 for the argument setup.
1231 if (NumOps == -1)
1232 return 4;
1233 return NumOps;
1234}
1235
1236InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 1237 VectorType *DstTy, VectorType *SrcTy,
 1238 ArrayRef<int> Mask,
 1239 TTI::TargetCostKind CostKind,
 1240 int Index, VectorType *SubTp,
 1241 ArrayRef<const Value *> Args,
 1242 const Instruction *CxtI) const {
1243 assert((Mask.empty() || DstTy->isScalableTy() ||
1244 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1245 "Expected the Mask to match the return size if given");
1246 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1247 "Expected the same scalar types");
1248
1249 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1250 // Treat extractsubvector as single op permutation.
1251 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
 1252 if (IsExtractSubvector)
 1253 Kind = TTI::SK_PermuteSingleSrc;
1254 if (ST->hasNEON()) {
1255 if (Kind == TTI::SK_Broadcast) {
1256 static const CostTblEntry NEONDupTbl[] = {
1257 // VDUP handles these cases.
1258 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1259 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1260 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1261 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1262 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1263 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1264
1265 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1266 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1267 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1268 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1269
1270 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1271 if (const auto *Entry =
1272 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1273 return LT.first * Entry->Cost;
1274 }
1275 if (Kind == TTI::SK_Reverse) {
1276 static const CostTblEntry NEONShuffleTbl[] = {
1277 // Reverse shuffle cost one instruction if we are shuffling within a
1278 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1279 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1280 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1281 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1282 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1283 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1284 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1285
1286 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1287 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1288 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1289 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1290
1291 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1292 if (const auto *Entry =
1293 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1294 return LT.first * Entry->Cost;
1295 }
1296 if (Kind == TTI::SK_Select) {
1297 static const CostTblEntry NEONSelShuffleTbl[] = {
 1298 // Select shuffle cost table for ARM. Cost is the number of instructions
 1299 // required to create the shuffled vector.
 1300
1301
1302 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1303 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1304 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1305 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1306
1307 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1308 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1309 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1310
1311 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1312
1313 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1314
1315 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1316 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1317 ISD::VECTOR_SHUFFLE, LT.second))
1318 return LT.first * Entry->Cost;
1319 }
1320 }
1321 if (ST->hasMVEIntegerOps()) {
1322 if (Kind == TTI::SK_Broadcast) {
1323 static const CostTblEntry MVEDupTbl[] = {
1324 // VDUP handles these cases.
1325 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1326 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1327 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1328 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1329 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1330
1331 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1332 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1333 LT.second))
1334 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
1335 }
1336
1337 if (!Mask.empty()) {
1338 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1339 // Check for LD2/LD4 instructions, which are represented in llvm IR as
1340 // deinterleaving-shuffle(load). The shuffle cost could potentially be
1341 // free, but we model it with a cost of LT.first so that LD2/LD4 have a
1342 // higher cost than just the load.
1343 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
1344 (LT.second.getScalarSizeInBits() == 8 ||
1345 LT.second.getScalarSizeInBits() == 16 ||
1346 LT.second.getScalarSizeInBits() == 32) &&
1347 LT.second.getSizeInBits() == 128 &&
1348 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
 1349 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2)) ||
 1350 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
 1351 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4))))
 1352 return ST->getMVEVectorCostFactor(CostKind) *
1353 std::max<InstructionCost>(1, LT.first / 4);
1354
1355 // Check for ST2/ST4 instructions, which are represented in llvm IR as
1356 // store(interleaving-shuffle). The shuffle cost could potentially be
1357 // free, but we model it with a cost of LT.first so that ST2/ST4 have a
1358 // higher cost than just the store.
1359 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
1360 (LT.second.getScalarSizeInBits() == 8 ||
1361 LT.second.getScalarSizeInBits() == 16 ||
1362 LT.second.getScalarSizeInBits() == 32) &&
1363 LT.second.getSizeInBits() == 128 &&
1364 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
 1365 ShuffleVectorInst::isInterleaveMask(
 1366 Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) ||
1367 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
 1368 ShuffleVectorInst::isInterleaveMask(
 1369 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2))))
1370 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1371
1372 if (LT.second.isVector() &&
1373 Mask.size() <= LT.second.getVectorNumElements() &&
1374 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1375 isVREVMask(Mask, LT.second, 64)))
1376 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1377 }
1378 }
1379
1380 // Restore optimal kind.
 1381 if (IsExtractSubvector)
 1382 Kind = TTI::SK_ExtractSubvector;
 1383 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
 1384 ? ST->getMVEVectorCostFactor(CostKind)
 1385 : 1;
1386 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1387 Index, SubTp);
1388}
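// For example, a <8 x i16> shuffle that swaps each adjacent pair of lanes (a
// VREV32.16-style mask) is priced as a single MVE operation scaled by the MVE
// cost factor, while a deinterleaving shuffle fed directly by a load is costed
// as part of a VLD2/VLD4 rather than as a separate permute.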
1389
1390InstructionCost ARMTTIImpl::getArithmeticInstrCost(
 1391 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
 1392 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
 1393 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1394 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1395 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1396 // Make operations on i1 relatively expensive as this often involves
1397 // combining predicates. AND and XOR should be easier to handle with IT
1398 // blocks.
1399 switch (ISDOpcode) {
1400 default:
1401 break;
1402 case ISD::AND:
1403 case ISD::XOR:
1404 return 2;
1405 case ISD::OR:
1406 return 3;
1407 }
1408 }
1409
1410 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1411
1412 if (ST->hasNEON()) {
1413 const unsigned FunctionCallDivCost = 20;
1414 const unsigned ReciprocalDivCost = 10;
1415 static const CostTblEntry CostTbl[] = {
1416 // Division.
1417 // These costs are somewhat random. Choose a cost of 20 to indicate that
 1418 // vectorizing division (which adds a function call) is going to be very expensive.
1419 // Double registers types.
1420 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1421 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1422 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1423 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1424 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1425 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1426 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1427 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1428 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1429 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1430 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1431 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1432 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1433 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1434 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1435 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1436 // Quad register types.
1437 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1438 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1439 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1440 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1441 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1442 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1443 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1444 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1445 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1446 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1447 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1448 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1449 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1450 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1451 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1452 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1453 // Multiplication.
1454 };
1455
1456 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1457 return LT.first * Entry->Cost;
1458
 1459 InstructionCost Cost = BaseT::getArithmeticInstrCost(
 1460 Opcode, Ty, CostKind, Op1Info, Op2Info);
1461
1462 // This is somewhat of a hack. The problem that we are facing is that SROA
1463 // creates a sequence of shift, and, or instructions to construct values.
1464 // These sequences are recognized by the ISel and have zero-cost. Not so for
1465 // the vectorized code. Because we have support for v2i64 but not i64 those
1466 // sequences look particularly beneficial to vectorize.
1467 // To work around this we increase the cost of v2i64 operations to make them
1468 // seem less beneficial.
1469 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1470 Cost += 4;
1471
1472 return Cost;
1473 }
1474
1475 // If this operation is a shift on arm/thumb2, it might well be folded into
1476 // the following instruction, hence having a cost of 0.
1477 auto LooksLikeAFreeShift = [&]() {
1478 if (ST->isThumb1Only() || Ty->isVectorTy())
1479 return false;
1480
1481 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1482 return false;
1483 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1484 return false;
1485
1486 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1487 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1488 case Instruction::Add:
1489 case Instruction::Sub:
1490 case Instruction::And:
1491 case Instruction::Xor:
1492 case Instruction::Or:
1493 case Instruction::ICmp:
1494 return true;
1495 default:
1496 return false;
1497 }
1498 };
1499 if (LooksLikeAFreeShift())
1500 return 0;
1501
 1502 // When targets have both DSP and MVE we find that
 1503 // the compiler will attempt to vectorize as well as use
 1504 // scalar (S/U)MLAL operations. In cases where we have
 1505 // the pattern ext(mul(ext(i16), ext(i16))) we find
 1506 // that codegen performs better when only using (S/U)MLAL scalar
 1507 // ops instead of trying to mix vector ops with (S/U)MLAL ops. We therefore
 1508 // check if a mul instruction is used in a (U/S)MLAL pattern.
1509 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1510 Type *Ty) -> bool {
1511 if (!ST->hasDSP())
1512 return false;
1513
1514 if (!I)
1515 return false;
1516
1517 if (Opcode != Instruction::Mul)
1518 return false;
1519
1520 if (Ty->isVectorTy())
1521 return false;
1522
1523 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1524 return cast<Instruction>(LHS)->getOpcode() ==
1525 cast<Instruction>(RHS)->getOpcode();
1526 };
1527 auto IsExtInst = [](const Value *V) -> bool {
1528 return isa<ZExtInst>(V) || isa<SExtInst>(V);
1529 };
1530 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1531 return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
1532 };
1533
1534 // We check the arguments of the instruction to see if they're extends
1535 auto *BinOp = dyn_cast<BinaryOperator>(I);
1536 if (!BinOp)
1537 return false;
1538 Value *Op0 = BinOp->getOperand(0);
1539 Value *Op1 = BinOp->getOperand(1);
1540 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1541 // We're interested in an ext of an i16
1542 if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
1543 !IsExtensionFromHalf(Op1))
1544 return false;
1545 // We need to check if this result will be further extended to i64
1546 // and that all these uses are SExt
1547 for (auto *U : I->users())
1548 if (!IsExtInst(U))
1549 return false;
1550 return true;
1551 }
1552
1553 return false;
1554 };
1555
1556 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1557 return 0;
1558
1559 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1560 // for "multiple beats" potentially needed by MVE instructions.
1561 int BaseCost = 1;
1562 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1563 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1564
1565 // The rest of this mostly follows what is done in
1566 // BaseT::getArithmeticInstrCost, without treating floats as more expensive
 1567 // than scalars or increasing the costs for custom operations. The result is
1568 // also multiplied by the MVEVectorCostFactor where appropriate.
1569 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1570 return LT.first * BaseCost;
1571
1572 // Else this is expand, assume that we need to scalarize this op.
1573 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1574 unsigned Num = VTy->getNumElements();
1575 InstructionCost Cost =
1576 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1577 // Return the cost of multiple scalar invocation plus the cost of
1578 // inserting and extracting the values.
1579 SmallVector<Type *> Tys(Args.size(), Ty);
1580 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1581 Num * Cost;
1582 }
1583
1584 return BaseCost;
1585}
1586
1587 InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1588 Align Alignment,
1589 unsigned AddressSpace,
1590 TTI::TargetCostKind CostKind,
1591 TTI::OperandValueInfo OpInfo,
1592 const Instruction *I) const {
1593 // TODO: Handle other cost kinds.
1594 if (CostKind != TTI::TCK_RecipThroughput)
1595 return 1;
1596
1597 // Type legalization can't handle structs
1598 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1599 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1600 CostKind);
1601
1602 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1603 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1604 // Unaligned loads/stores are extremely inefficient.
1605 // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1606 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1607 return LT.first * 4;
1608 }
1609
1610 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1611 // Same for stores.
1612 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1613 ((Opcode == Instruction::Load && I->hasOneUse() &&
1614 isa<FPExtInst>(*I->user_begin())) ||
1615 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
1616 FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
1617 Type *DstTy =
1618 Opcode == Instruction::Load
1619 ? (*I->user_begin())->getType()
1620 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1621 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1622 DstTy->getScalarType()->isFloatTy())
1623 return ST->getMVEVectorCostFactor(CostKind);
1624 }
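// A minimal example of the pattern handled above (assumed IR, for illustration):
//   %l = load <4 x half>, ptr %p
//   %e = fpext <4 x half> %l to <4 x float>
// which MVE can handle with a single extending load, so only the MVE vector
// cost factor is charged rather than a separate load plus convert.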
1625
1626 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1627 ? ST->getMVEVectorCostFactor(CostKind)
1628 : 1;
1629 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1630 CostKind, OpInfo, I);
1631}
1632
1633 InstructionCost
1634 ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1635 unsigned AddressSpace,
1636 TTI::TargetCostKind CostKind) const {
1637 if (ST->hasMVEIntegerOps()) {
1638 if (Opcode == Instruction::Load &&
1639 isLegalMaskedLoad(Src, Alignment, AddressSpace))
1640 return ST->getMVEVectorCostFactor(CostKind);
1641 if (Opcode == Instruction::Store &&
1642 isLegalMaskedStore(Src, Alignment, AddressSpace))
1643 return ST->getMVEVectorCostFactor(CostKind);
1644 }
1645 if (!isa<FixedVectorType>(Src))
1646 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1647 CostKind);
1648 // Scalar cost, which is currently very high due to the inefficiency of the
1649 // generated code.
1650 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1651}
1652
1653 InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
1654 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1655 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1656 bool UseMaskForCond, bool UseMaskForGaps) const {
1657 assert(Factor >= 2 && "Invalid interleave factor");
1658 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1659
1660 // vldN/vstN don't support vector types with i64/f64 elements.
1661 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1662
1663 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1664 !UseMaskForCond && !UseMaskForGaps) {
1665 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1666 auto *SubVecTy =
1667 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1668
1669 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1670 // Accesses having vector types that are a multiple of 128 bits can be
1671 // matched to more than one vldN/vstN instruction.
1672 int BaseCost =
1673 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1674 if (NumElts % Factor == 0 &&
1675 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1676 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1677
1678 // Some smaller than legal interleaved patterns are cheap as we can make
1679 // use of the vmovn or vrev patterns to interleave a standard load. This is
1680 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1681 // promoted differently). The cost of 2 here is then a load and vrev or
1682 // vmovn.
1683 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1684 VecTy->isIntOrIntVectorTy() &&
1685 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1686 return 2 * BaseCost;
1687 }
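// Worked example (illustrative): for a factor-2 group over <8 x i8>, SubVecTy is
// <4 x i8> (32 bits), which is not a legal vldN type, but the branch above
// charges just 2 * BaseCost to model a plain load plus a vrev/vmovn style
// rearrangement instead of a full vld2.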
1688
1689 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1690 Alignment, AddressSpace, CostKind,
1691 UseMaskForCond, UseMaskForGaps);
1692}
1693
1694 InstructionCost ARMTTIImpl::getGatherScatterOpCost(
1695 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1696 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1697 using namespace PatternMatch;
1698 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1699 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1700 Alignment, CostKind, I);
1701
1702 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1703 auto *VTy = cast<FixedVectorType>(DataTy);
1704
1705 // TODO: Splitting, once we do that.
1706
1707 unsigned NumElems = VTy->getNumElements();
1708 unsigned EltSize = VTy->getScalarSizeInBits();
1709 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1710
1711 // For now, it is assumed that for the MVE gather instructions the loads are
1712 // all effectively serialised. This means the cost is the scalar cost
1713 // multiplied by the number of elements being loaded. This is possibly very
1714 // conservative, but even so we still end up vectorising loops because the
1715 // cost per iteration for many loops is lower than for scalar loops.
1716 InstructionCost VectorCost =
1717 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1718 // The scalarization cost should be a lot higher. We use the number of vector
1719 // elements plus the scalarization overhead. If masking is required then a lot
1720 // of little blocks will be needed and potentially a scalarized p0 mask,
1721 // greatly increasing the cost.
1722 InstructionCost ScalarCost =
1723 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1724 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1725 CostKind) +
1726 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1727 CostKind);
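// Rough illustration of the trade-off (numbers are schematic, not measured):
// for a <4 x i32> gather with a variable mask and LT.first == 1,
//   VectorCost = 4 * MVEVectorCostFactor
//   ScalarCost = 4 + 4 * 5 + insert/extract scalarization overhead
// so the vector form is normally preferred once the checks below succeed.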
1728
1729 if (EltSize < 8 || Alignment < EltSize / 8)
1730 return ScalarCost;
1731
1732 unsigned ExtSize = EltSize;
1733 // Check whether there's a single user that asks for an extended type
1734 if (I != nullptr) {
1735 // Depending on the caller of this function, a gather instruction will
1736 // either have opcode Instruction::Load or be a call to the masked_gather
1737 // intrinsic
1738 if ((I->getOpcode() == Instruction::Load ||
1739 match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
1740 I->hasOneUse()) {
1741 const User *Us = *I->users().begin();
1742 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1743 // only allow valid type combinations
1744 unsigned TypeSize =
1745 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1746 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1747 (TypeSize == 16 && EltSize == 8)) &&
1748 TypeSize * NumElems == 128) {
1749 ExtSize = TypeSize;
1750 }
1751 }
1752 }
1753 // Check whether the input data needs to be truncated
1754 TruncInst *T;
1755 if ((I->getOpcode() == Instruction::Store ||
1756 match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
1757 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1758 // Only allow valid type combinations
1759 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1760 if (((EltSize == 16 && TypeSize == 32) ||
1761 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1762 TypeSize * NumElems == 128)
1763 ExtSize = TypeSize;
1764 }
1765 }
1766
1767 if (ExtSize * NumElems != 128 || NumElems < 4)
1768 return ScalarCost;
1769
1770 // Any (aligned) i32 gather will not need to be scalarised.
1771 if (ExtSize == 32)
1772 return VectorCost;
1773 // For smaller types, we need to ensure that the gep's inputs are correctly
1774 // extended from a small enough value. Other sizes (including i64) are
1775 // scalarized for now.
1776 if (ExtSize != 8 && ExtSize != 16)
1777 return ScalarCost;
1778
1779 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1780 Ptr = BC->getOperand(0);
1781 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1782 if (GEP->getNumOperands() != 2)
1783 return ScalarCost;
1784 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1785 // Scale needs to be correct (which is only relevant for i16s).
1786 if (Scale != 1 && Scale * 8 != ExtSize)
1787 return ScalarCost;
1788 // And we need to zext (not sext) the indexes from a small enough type.
1789 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1790 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1791 return VectorCost;
1792 }
1793 return ScalarCost;
1794 }
1795 return ScalarCost;
1796}
1797
1798 InstructionCost
1799 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
1800 std::optional<FastMathFlags> FMF,
1801 TTI::TargetCostKind CostKind) const {
1802
1803 EVT ValVT = TLI->getValueType(DL, ValTy);
1804 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1805 unsigned EltSize = ValVT.getScalarSizeInBits();
1806
1807 // In general floating point reductions are a series of elementwise
1808 // operations, with free extracts on each step. These are either in-order or
1809 // treewise depending on whether that is allowed by the fast math flags.
1810 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1811 ((EltSize == 32 && ST->hasVFP2Base()) ||
1812 (EltSize == 64 && ST->hasFP64()) ||
1813 (EltSize == 16 && ST->hasFullFP16()))) {
1814 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1815 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1816 InstructionCost VecCost = 0;
1817 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1818 NumElts * EltSize > VecLimit) {
1819 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1820 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1821 NumElts /= 2;
1822 }
1823
1824 // For fp16 we need to extract the upper lane elements. MVE can add a
1825 // VREV+FMIN/MAX to perform another vector step instead.
1826 InstructionCost ExtractCost = 0;
1827 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1828 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1829 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1830 NumElts /= 2;
1831 } else if (ValVT.getVectorElementType() == MVT::f16)
1832 ExtractCost = NumElts / 2;
1833
1834 return VecCost + ExtractCost +
1835 NumElts *
1836 getArithmeticInstrCost(Opcode, ValTy->getElementType(), CostKind);
1837 }
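// Worked example (illustrative): a fast-math fadd reduction of <8 x float> with
// MVE starts above the 128-bit limit, so one halving step is charged as a
// <4 x float> fadd, after which the remaining 4 lanes are charged as 4 scalar
// fadds (the extra f16 extract handling does not apply here).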
1838
1839 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1840 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1841 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1842 unsigned VecLimit =
1843 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1844 InstructionCost VecCost = 0;
1845 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1846 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1847 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1848 NumElts /= 2;
1849 }
1850 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1851 // step.
1852 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1853 NumElts * EltSize == 64) {
1854 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1855 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1856 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1857 NumElts /= 2;
1858 }
1859
1860 // From here we extract the elements and perform the and/or/xor.
1861 InstructionCost ExtractCost = NumElts;
1862 return VecCost + ExtractCost +
1863 (NumElts - 1) * getArithmeticInstrCost(
1864 Opcode, ValTy->getElementType(), CostKind);
1865 }
1866
1867 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1868 TTI::requiresOrderedReduction(FMF))
1869 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1870
1871 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1872
1873 static const CostTblEntry CostTblAdd[]{
1874 {ISD::ADD, MVT::v16i8, 1},
1875 {ISD::ADD, MVT::v8i16, 1},
1876 {ISD::ADD, MVT::v4i32, 1},
1877 };
1878 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1879 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1880
1881 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1882}
1883
1884 InstructionCost ARMTTIImpl::getExtendedReductionCost(
1885 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1886 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1887 EVT ValVT = TLI->getValueType(DL, ValTy);
1888 EVT ResVT = TLI->getValueType(DL, ResTy);
1889
1890 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1891
1892 switch (ISD) {
1893 case ISD::ADD:
1894 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1895 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1896
1897 // The legal cases are:
1898 // VADDV u/s 8/16/32
1899 // VADDLV u/s 32
1900 // Codegen currently cannot always handle larger than legal vectors very
1901 // well, especially for predicated reductions where the mask needs to be
1902 // split, so restrict to 128bit or smaller input types.
1903 unsigned RevVTSize = ResVT.getSizeInBits();
1904 if (ValVT.getSizeInBits() <= 128 &&
1905 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1906 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1907 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1908 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1909 }
1910 break;
1911 default:
1912 break;
1913 }
1914 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1915 CostKind);
1916}
1917
1918 InstructionCost
1919 ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
1920 VectorType *ValTy,
1921 TTI::TargetCostKind CostKind) const {
1922 EVT ValVT = TLI->getValueType(DL, ValTy);
1923 EVT ResVT = TLI->getValueType(DL, ResTy);
1924
1925 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1926 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1927
1928 // The legal cases are:
1929 // VMLAV u/s 8/16/32
1930 // VMLALV u/s 16/32
1931 // Codegen currently cannot always handle larger than legal vectors very
1932 // well, especially for predicated reductions where the mask needs to be
1933 // split, so restrict to 128bit or smaller input types.
1934 unsigned RevVTSize = ResVT.getSizeInBits();
1935 if (ValVT.getSizeInBits() <= 128 &&
1936 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1937 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1938 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1939 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1940 }
1941
1942 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
1943}
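// Illustrative IR shape for the VMLAV case above (an assumed example, not from
// the source): with ValTy = <16 x i8> and ResTy = i32,
//   %xe = sext <16 x i8> %x to <16 x i32>
//   %ye = sext <16 x i8> %y to <16 x i32>
//   %m  = mul <16 x i32> %xe, %ye
//   %r  = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
// hits the v16i8 case and is costed at MVEVectorCostFactor * LT.first.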
1944
1945 InstructionCost
1946 ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1947 FastMathFlags FMF,
1948 TTI::TargetCostKind CostKind) const {
1949 EVT ValVT = TLI->getValueType(DL, Ty);
1950
1951 // In general floating point reductions are a series of elementwise
1952 // operations, with free extracts on each step. These are either in-order or
1953 // treewise depending on whether that is allowed by the fast math flags.
1954 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1955 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1956 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1957 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1958 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
1959 unsigned EltSize = ValVT.getScalarSizeInBits();
1960 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1961 InstructionCost VecCost;
1962 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1963 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
1964 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1965 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1966 NumElts /= 2;
1967 }
1968
1969 // For fp16 we need to extract the upper lane elements. MVE can add a
1970 // VREV+FMIN/MAX to perform another vector step instead.
1971 InstructionCost ExtractCost = 0;
1972 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1973 NumElts == 8) {
1974 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1975 NumElts /= 2;
1976 } else if (ValVT.getVectorElementType() == MVT::f16)
1977 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
1978
1980 {Ty->getElementType(), Ty->getElementType()},
1981 FMF);
1982 return VecCost + ExtractCost +
1983 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
1984 }
1985
1986 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
1987 IID == Intrinsic::umin || IID == Intrinsic::umax) {
1988 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1989
1990 // All costs are the same for u/s min/max. These lower to vminv, which are
1991 // given a slightly higher cost as they tend to take multiple cycles for
1992 // smaller type sizes.
1993 static const CostTblEntry CostTblAdd[]{
1994 {ISD::SMIN, MVT::v16i8, 4},
1995 {ISD::SMIN, MVT::v8i16, 3},
1996 {ISD::SMIN, MVT::v4i32, 2},
1997 };
1998 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
1999 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
2000 }
2001
2002 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2003}
2004
2005 InstructionCost
2006 ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2007 TTI::TargetCostKind CostKind) const {
2008 unsigned Opc = ICA.getID();
2009 switch (Opc) {
2010 case Intrinsic::get_active_lane_mask:
2011 // Currently we make a somewhat optimistic assumption that
2012 // active_lane_mask's are always free. In reality it may be freely folded
2013 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
2014 // of add/icmp code. We may need to improve this in the future, but being
2015 // able to detect if it is free or not involves looking at a lot of other
2016 // code. We currently assume that the vectorizer inserted these, and knew
2017 // what it was doing in adding one.
2018 if (ST->hasMVEIntegerOps())
2019 return 0;
2020 break;
2021 case Intrinsic::sadd_sat:
2022 case Intrinsic::ssub_sat:
2023 case Intrinsic::uadd_sat:
2024 case Intrinsic::usub_sat: {
2025 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2026 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2027 Type *RetTy = ICA.getReturnType();
2028
2029 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
2030 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
2031 return 1; // qadd / qsub
2032 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2033 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2034 // Otherwise return the cost of expanding the node. Generally an add +
2035 // icmp + sel.
2036 CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
2037 Type *CondTy = RetTy->getWithNewBitWidth(1);
2038 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
2039 RetTy, CostKind) +
2040 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
2041 CostKind) +
2042 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
2043 CostKind);
2044 }
2045
2046 if (!ST->hasMVEIntegerOps())
2047 break;
2048
2049 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2050 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2051 LT.second == MVT::v16i8) {
2052 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
2053 // need to extend the type, as it uses shr(qadd(shl, shl)).
2054 unsigned Instrs =
2055 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2056 : 4;
2057 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2058 }
2059 break;
2060 }
2061 case Intrinsic::abs:
2062 case Intrinsic::smin:
2063 case Intrinsic::smax:
2064 case Intrinsic::umin:
2065 case Intrinsic::umax: {
2066 if (!ST->hasMVEIntegerOps())
2067 break;
2068 Type *VT = ICA.getReturnType();
2069
2070 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2071 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2072 LT.second == MVT::v16i8)
2073 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2074 break;
2075 }
2076 case Intrinsic::minnum:
2077 case Intrinsic::maxnum: {
2078 if (!ST->hasMVEFloatOps())
2079 break;
2080 Type *VT = ICA.getReturnType();
2081 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2082 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2083 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2084 break;
2085 }
2086 case Intrinsic::fptosi_sat:
2087 case Intrinsic::fptoui_sat: {
2088 if (ICA.getArgTypes().empty())
2089 break;
2090 bool IsSigned = Opc == Intrinsic::fptosi_sat;
2091 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
2092 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
2093 // Check for the legal types, with the correct subtarget features.
2094 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2095 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2096 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2097 return LT.first;
2098
2099 // Equally for MVE vector types
2100 if (ST->hasMVEFloatOps() &&
2101 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2102 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2103 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2104
2105 // If we can we use a legal convert followed by a min+max
2106 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2107 (ST->hasFP64() && LT.second == MVT::f64) ||
2108 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2109 (ST->hasMVEFloatOps() &&
2110 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2111 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2112 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
2113 LT.second.getScalarSizeInBits());
2114 InstructionCost Cost =
2115 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2116 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2117 : Intrinsic::umin,
2118 LegalTy, {LegalTy, LegalTy});
2119 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
2120 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2121 : Intrinsic::umax,
2122 LegalTy, {LegalTy, LegalTy});
2123 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2124 return LT.first * Cost;
2125 }
2126 // Otherwise we need to follow the default expansion that clamps the value
2127 // using a float min/max with a fcmp+sel for nan handling when signed.
2128 Type *FPTy = ICA.getArgTypes()[0];
2129 Type *RetTy = ICA.getReturnType();
2130 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2131 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
2132 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2133 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
2134 Cost +=
2135 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2136 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
2137 if (IsSigned) {
2138 Type *CondTy = RetTy->getWithNewBitWidth(1);
2139 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2140 CmpInst::FCMP_UNO, CostKind);
2141 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2142 CmpInst::FCMP_UNO, CostKind);
2143 }
2144 return Cost;
2145 }
2146 }
2147
2148 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
2149}
2150
2151 bool ARMTTIImpl::isLoweredToCall(const Function *F) const {
2152 if (!F->isIntrinsic())
2153 return BaseT::isLoweredToCall(F);
2154
2155 // Assume all Arm-specific intrinsics map to an instruction.
2156 if (F->getName().starts_with("llvm.arm"))
2157 return false;
2158
2159 switch (F->getIntrinsicID()) {
2160 default: break;
2161 case Intrinsic::powi:
2162 case Intrinsic::sin:
2163 case Intrinsic::cos:
2164 case Intrinsic::sincos:
2165 case Intrinsic::pow:
2166 case Intrinsic::log:
2167 case Intrinsic::log10:
2168 case Intrinsic::log2:
2169 case Intrinsic::exp:
2170 case Intrinsic::exp2:
2171 return true;
2172 case Intrinsic::sqrt:
2173 case Intrinsic::fabs:
2174 case Intrinsic::copysign:
2175 case Intrinsic::floor:
2176 case Intrinsic::ceil:
2177 case Intrinsic::trunc:
2178 case Intrinsic::rint:
2179 case Intrinsic::nearbyint:
2180 case Intrinsic::round:
2181 case Intrinsic::canonicalize:
2182 case Intrinsic::lround:
2183 case Intrinsic::llround:
2184 case Intrinsic::lrint:
2185 case Intrinsic::llrint:
2186 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2187 return true;
2188 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2189 return true;
2190 // Some operations can be handled by vector instructions and assume
2191 // unsupported vectors will be expanded into supported scalar ones.
2192 // TODO Handle scalar operations properly.
2193 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
2194 case Intrinsic::masked_store:
2195 case Intrinsic::masked_load:
2196 case Intrinsic::masked_gather:
2197 case Intrinsic::masked_scatter:
2198 return !ST->hasMVEIntegerOps();
2199 case Intrinsic::sadd_with_overflow:
2200 case Intrinsic::uadd_with_overflow:
2201 case Intrinsic::ssub_with_overflow:
2202 case Intrinsic::usub_with_overflow:
2203 case Intrinsic::sadd_sat:
2204 case Intrinsic::uadd_sat:
2205 case Intrinsic::ssub_sat:
2206 case Intrinsic::usub_sat:
2207 return false;
2208 }
2209
2210 return BaseT::isLoweredToCall(F);
2211}
2212
2213 bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) const {
2214 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2215 EVT VT = TLI->getValueType(DL, I.getType(), true);
2216 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2217 return true;
2218
2219 // Check if an intrinsic will be lowered to a call and assume that any
2220 // other CallInst will generate a bl.
2221 if (auto *Call = dyn_cast<CallInst>(&I)) {
2222 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2223 switch(II->getIntrinsicID()) {
2224 case Intrinsic::memcpy:
2225 case Intrinsic::memset:
2226 case Intrinsic::memmove:
2227 return getNumMemOps(II) == -1;
2228 default:
2229 if (const Function *F = Call->getCalledFunction())
2230 return isLoweredToCall(F);
2231 }
2232 }
2233 return true;
2234 }
2235
2236 // FPv5 provides conversions between integer, double-precision,
2237 // single-precision, and half-precision formats.
2238 switch (I.getOpcode()) {
2239 default:
2240 break;
2241 case Instruction::FPToSI:
2242 case Instruction::FPToUI:
2243 case Instruction::SIToFP:
2244 case Instruction::UIToFP:
2245 case Instruction::FPTrunc:
2246 case Instruction::FPExt:
2247 return !ST->hasFPARMv8Base();
2248 }
2249
2250 // FIXME: Unfortunately the approach of checking the Operation Action does
2251 // not catch all cases of Legalization that use library calls. Our
2252 // Legalization step categorizes some transformations into library calls as
2253 // Custom, Expand or even Legal when doing type legalization. So for now
2254 // we have to special case for instance the SDIV of 64bit integers and the
2255 // use of floating point emulation.
2256 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2257 switch (ISD) {
2258 default:
2259 break;
2260 case ISD::SDIV:
2261 case ISD::UDIV:
2262 case ISD::SREM:
2263 case ISD::UREM:
2264 case ISD::SDIVREM:
2265 case ISD::UDIVREM:
2266 return true;
2267 }
2268 }
2269
2270 // Assume all other non-float operations are supported.
2271 if (!VT.isFloatingPoint())
2272 return false;
2273
2274 // We'll need a library call to handle most floats when using soft.
2275 if (TLI->useSoftFloat()) {
2276 switch (I.getOpcode()) {
2277 default:
2278 return true;
2279 case Instruction::Alloca:
2280 case Instruction::Load:
2281 case Instruction::Store:
2282 case Instruction::Select:
2283 case Instruction::PHI:
2284 return false;
2285 }
2286 }
2287
2288 // We'll need a libcall to perform double precision operations on a single
2289 // precision only FPU.
2290 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2291 return true;
2292
2293 // Likewise for half precision arithmetic.
2294 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2295 return true;
2296
2297 return false;
2298}
2299
2300 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
2301 AssumptionCache &AC,
2302 TargetLibraryInfo *LibInfo,
2303 HardwareLoopInfo &HWLoopInfo) const {
2304 // Low-overhead branches are only supported in the 'low-overhead branch'
2305 // extension of v8.1-m.
2306 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2307 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2308 return false;
2309 }
2310
2311 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
2312 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2313 return false;
2314 }
2315
2316 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2317 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2318 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2319 return false;
2320 }
2321
2322 const SCEV *TripCountSCEV =
2323 SE.getAddExpr(BackedgeTakenCount,
2324 SE.getOne(BackedgeTakenCount->getType()));
2325
2326 // We need to store the trip count in LR, a 32-bit register.
2327 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2328 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2329 return false;
2330 }
2331
2332 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2333 // point in generating a hardware loop if that's going to happen.
2334
2335 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2336 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2337 switch (Call->getIntrinsicID()) {
2338 default:
2339 break;
2340 case Intrinsic::start_loop_iterations:
2341 case Intrinsic::test_start_loop_iterations:
2342 case Intrinsic::loop_decrement:
2343 case Intrinsic::loop_decrement_reg:
2344 return true;
2345 }
2346 }
2347 return false;
2348 };
2349
2350 // Scan the instructions to see if there's any that we know will turn into a
2351 // call or if this loop is already a low-overhead loop or will become a tail
2352 // predicated loop.
2353 bool IsTailPredLoop = false;
2354 auto ScanLoop = [&](Loop *L) {
2355 for (auto *BB : L->getBlocks()) {
2356 for (auto &I : *BB) {
2357 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2358 isa<InlineAsm>(I)) {
2359 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2360 return false;
2361 }
2362 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2363 IsTailPredLoop |=
2364 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2365 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2366 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2367 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2368 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2369 }
2370 }
2371 return true;
2372 };
2373
2374 // Visit inner loops.
2375 for (auto *Inner : *L)
2376 if (!ScanLoop(Inner))
2377 return false;
2378
2379 if (!ScanLoop(L))
2380 return false;
2381
2382 // TODO: Check whether the trip count calculation is expensive. If L is the
2383 // inner loop but we know it has a low trip count, calculating that trip
2384 // count (in the parent loop) may be detrimental.
2385
2386 LLVMContext &C = L->getHeader()->getContext();
2387 HWLoopInfo.CounterInReg = true;
2388 HWLoopInfo.IsNestingLegal = false;
2389 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2390 HWLoopInfo.CountType = Type::getInt32Ty(C);
2391 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2392 return true;
2393}
2394
2395static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2396 // We don't allow icmps, and because we only look at single-block loops,
2397 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2398 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2399 return false;
2400 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2401 // not currently canonical, but soon will be. Code without them uses icmp, and
2402 // so is not tail predicated as per the condition above. In order to get the
2403 // same performance we treat min and max the same as an icmp for tailpred
2404 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2405 // pick more optimal instructions like VQDMULH. They need to be recognized
2406 // directly by the vectorizer).
2407 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2408 if ((II->getIntrinsicID() == Intrinsic::smin ||
2409 II->getIntrinsicID() == Intrinsic::smax ||
2410 II->getIntrinsicID() == Intrinsic::umin ||
2411 II->getIntrinsicID() == Intrinsic::umax) &&
2412 ++ICmpCount > 1)
2413 return false;
2414
2415 if (isa<FCmpInst>(&I))
2416 return false;
2417
2418 // We could allow extending/narrowing FP loads/stores, but codegen is
2419 // too inefficient so reject this for now.
2420 if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
2421 return false;
2422
2423 // Extends have to be extending-loads
2424 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2425 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2426 return false;
2427
2428 // Truncs have to be narrowing-stores
2429 if (isa<TruncInst>(&I) )
2430 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2431 return false;
2432
2433 return true;
2434}
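// Illustrative examples for the rules above (assumed IR, not from the source):
// an extend whose input is a single-use load, e.g. "%w = sext i16 %l to i32"
// with %l a load, is accepted because it can become an extending load, whereas
// an extend of an arbitrary value, or a trunc that does not feed straight into
// a store, would break the uniform element-size assumption and is rejected.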
2435
2436// To set up a tail-predicated loop, we need to know the total number of
2437// elements processed by that loop. Thus, we need to determine the element
2438// size and:
2439// 1) it should be uniform for all operations in the vector loop, so we
2440// e.g. don't want any widening/narrowing operations.
2441// 2) it should be smaller than i64s because we don't have vector operations
2442// that work on i64s.
2443// 3) we don't want elements to be reversed or shuffled, to make sure the
2444// tail-predication masks/predicates the right lanes.
2445//
2447 const DataLayout &DL,
2448 const LoopAccessInfo *LAI) {
2449 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2450
2451 // If there are live-out values, it is probably a reduction. We can predicate
2452 // most reduction operations freely under MVE using a combination of
2453 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2454 // floating point and integer reductions, but don't check for operators
2455 // specifically here. If the value ends up not being a reduction (and so the
2456 // vectorizer cannot tailfold the loop), we should fall back to standard
2457 // vectorization automatically.
2458 SmallVector<Instruction *, 8> LiveOuts;
2459 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2460 bool ReductionsDisabled =
2461 EnableTailPredication == TailPredication::EnabledNoReductions ||
2462 EnableTailPredication == TailPredication::ForceEnabledNoReductions;
2463
2464 for (auto *I : LiveOuts) {
2465 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2466 !I->getType()->isHalfTy()) {
2467 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2468 "live-out value\n");
2469 return false;
2470 }
2471 if (ReductionsDisabled) {
2472 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2473 return false;
2474 }
2475 }
2476
2477 // Next, check that all instructions can be tail-predicated.
2478 PredicatedScalarEvolution PSE = LAI->getPSE();
2479 int ICmpCount = 0;
2480
2481 for (BasicBlock *BB : L->blocks()) {
2482 for (Instruction &I : BB->instructionsWithoutDebug()) {
2483 if (isa<PHINode>(&I))
2484 continue;
2485 if (!canTailPredicateInstruction(I, ICmpCount)) {
2486 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2487 return false;
2488 }
2489
2490 Type *T = I.getType();
2491 if (T->getScalarSizeInBits() > 32) {
2492 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2493 return false;
2494 }
2495 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2496 Value *Ptr = getLoadStorePointerOperand(&I);
2497 Type *AccessTy = getLoadStoreType(&I);
2498 int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0);
2499 if (NextStride == 1) {
2500 // TODO: for now only allow consecutive strides of 1. We could support
2501 // other strides as long as it is uniform, but let's keep it simple
2502 // for now.
2503 continue;
2504 } else if (NextStride == -1 ||
2505 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2506 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2507 LLVM_DEBUG(dbgs()
2508 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2509 "be tail-predicated\n.");
2510 return false;
2511 // TODO: don't tail predicate if there is a reversed load?
2512 } else if (EnableMaskedGatherScatters) {
2513 // Gather/scatters do allow loading from arbitrary strides, at
2514 // least if they are loop invariant.
2515 // TODO: Loop variant strides should in theory work, too, but
2516 // this requires further testing.
2517 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2518 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2519 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2520 if (PSE.getSE()->isLoopInvariant(Step, L))
2521 continue;
2522 }
2523 }
2524 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2525 "tail-predicate\n.");
2526 return false;
2527 }
2528 }
2529 }
2530
2531 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2532 return true;
2533}
2534
2535 bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
2536 if (!EnableTailPredication) {
2537 LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2538 return false;
2539 }
2540
2541 // Creating a predicated vector loop is the first step for generating a
2542 // tail-predicated hardware loop, for which we need the MVE masked
2543 // load/stores instructions:
2544 if (!ST->hasMVEIntegerOps())
2545 return false;
2546
2547 LoopVectorizationLegality *LVL = TFI->LVL;
2548 Loop *L = LVL->getLoop();
2549
2550 // For now, restrict this to single block loops.
2551 if (L->getNumBlocks() > 1) {
2552 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2553 "loop.\n");
2554 return false;
2555 }
2556
2557 assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2558
2559 LoopInfo *LI = LVL->getLoopInfo();
2560 HardwareLoopInfo HWLoopInfo(L);
2561 if (!HWLoopInfo.canAnalyze(*LI)) {
2562 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2563 "analyzable.\n");
2564 return false;
2565 }
2566
2567 AssumptionCache *AC = LVL->getAssumptionCache();
2568 ScalarEvolution *SE = LVL->getScalarEvolution();
2569
2570 // This checks if we have the low-overhead branch architecture
2571 // extension, and if we will create a hardware-loop:
2572 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2573 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2574 "profitable.\n");
2575 return false;
2576 }
2577
2578 DominatorTree *DT = LVL->getDominatorTree();
2579 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2580 LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2581 "a candidate.\n");
2582 return false;
2583 }
2584
2585 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI());
2586}
2587
2588 TailFoldingStyle
2589 ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const {
2590 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2591 return TailFoldingStyle::DataWithoutLaneMask;
2592
2593 // Intrinsic @llvm.get.active.lane.mask is supported.
2594 // It is used in the MVETailPredication pass, which requires the number of
2595 // elements processed by this vector loop to setup the tail-predicated
2596 // loop.
2597 return TailFoldingStyle::Data;
2598}
2599 void ARMTTIImpl::getUnrollingPreferences(
2600 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
2601 OptimizationRemarkEmitter *ORE) const {
2602 // Enable upper-bound unrolling universally, provided that we do not see an
2603 // active lane mask, which will be better kept as a loop to become tail
2604 // predicated than to be conditionally unrolled.
2605 UP.UpperBound =
2606 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2607 return isa<IntrinsicInst>(I) &&
2608 cast<IntrinsicInst>(I).getIntrinsicID() ==
2609 Intrinsic::get_active_lane_mask;
2610 });
2611
2612 // Only currently enable these preferences for M-Class cores.
2613 if (!ST->isMClass())
2614 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2615
2616 // Disable loop unrolling for Oz and Os.
2617 UP.OptSizeThreshold = 0;
2618 UP.PartialOptSizeThreshold = 0;
2619 if (L->getHeader()->getParent()->hasOptSize())
2620 return;
2621
2622 SmallVector<BasicBlock*, 4> ExitingBlocks;
2623 L->getExitingBlocks(ExitingBlocks);
2624 LLVM_DEBUG(dbgs() << "Loop has:\n"
2625 << "Blocks: " << L->getNumBlocks() << "\n"
2626 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2627
2628 // Only allow another exit other than the latch. This acts as an early exit
2629 // as it mirrors the profitability calculation of the runtime unroller.
2630 if (ExitingBlocks.size() > 2)
2631 return;
2632
2633 // Limit the CFG of the loop body for targets with a branch predictor.
2634 // Allowing 4 blocks permits if-then-else diamonds in the body.
2635 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2636 return;
2637
2638 // Don't unroll vectorized loops, including the remainder loop
2639 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2640 return;
2641
2642 // Scan the loop: don't unroll loops with calls as this could prevent
2643 // inlining.
2644 InstructionCost Cost = 0;
2645 for (auto *BB : L->getBlocks()) {
2646 for (auto &I : *BB) {
2647 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2648 // scalar code.
2649 if (I.getType()->isVectorTy())
2650 return;
2651
2652 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2653 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2654 if (!isLoweredToCall(F))
2655 continue;
2656 }
2657 return;
2658 }
2659
2660 SmallVector<const Value*, 4> Operands(I.operand_values());
2661 Cost += getInstructionCost(&I, Operands,
2662 TargetTransformInfo::TCK_SizeAndLatency);
2663 }
2664 }
2665
2666 // On v6m cores, there are very few registers available. We can easily end up
2667 // spilling and reloading more registers in an unrolled loop. Look at the
2668 // number of LCSSA phis as a rough measure of how many registers will need to
2669 // be live out of the loop, reducing the default unroll count if more than 1
2670 // value is needed. In the long run, all of this should be being learnt by a
2671 // machine.
2672 unsigned UnrollCount = 4;
2673 if (ST->isThumb1Only()) {
2674 unsigned ExitingValues = 0;
2675 SmallVector<BasicBlock *, 4> ExitBlocks;
2676 L->getExitBlocks(ExitBlocks);
2677 for (auto *Exit : ExitBlocks) {
2678 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2679 // only the last is expected to be needed for address operands.
2680 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2681 return PH.getNumOperands() != 1 ||
2682 !isa<GetElementPtrInst>(PH.getOperand(0));
2683 });
2684 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2685 }
2686 if (ExitingValues)
2687 UnrollCount /= ExitingValues;
2688 if (UnrollCount <= 1)
2689 return;
2690 }
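// Worked example (illustrative): with the default UnrollCount of 4, a Thumb1
// loop whose exit block has two non-GEP LCSSA phis ends up with
// UnrollCount = 4 / 2 = 2, while three or more live-out values drop it to 1 and
// disable unrolling entirely via the <= 1 check above.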
2691
2692 // For processors with low overhead branching (LOB), runtime unrolling the
2693 // innermost loop is often detrimental to performance. In these cases the loop
2694 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2695 // deeply nested loops get executed multiple times, negating the benefits of
2696 // LOB. This is particularly noticeable when the loop trip count of the
2697 // innermost loop varies within the outer loop, such as in the case of
2698 // triangular matrix decompositions. In these cases we will prefer to not
2699 // unroll the innermost loop, with the intention for it to be executed as a
2700 // low overhead loop.
2701 bool Runtime = true;
2702 if (ST->hasLOB()) {
2703 if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
2704 const auto *BETC = SE.getBackedgeTakenCount(L);
2705 auto *Outer = L->getOutermostLoop();
2706 if ((L != Outer && Outer != L->getParentLoop()) ||
2707 (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2708 Runtime = false;
2709 }
2710 }
2711 }
2712
2713 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2714 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2715
2716 UP.Partial = true;
2717 UP.Runtime = Runtime;
2718 UP.UnrollRemainder = true;
2719 UP.DefaultUnrollRuntimeCount = UnrollCount;
2720 UP.UnrollAndJam = true;
2721 UP.UnrollAndJamInnerLoopThreshold = 60;
2722
2723 // Force unrolling small loops can be very useful because of the branch
2724 // taken cost of the backedge.
2725 if (Cost < 12)
2726 UP.Force = true;
2727}
2728
2729 void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2730 TTI::PeelingPreferences &PP) const {
2731 BaseT::getPeelingPreferences(L, SE, PP);
2732}
2733
2734 bool ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const {
2735 if (!ST->hasMVEIntegerOps())
2736 return false;
2737
2738 unsigned ScalarBits = Ty->getScalarSizeInBits();
2739 switch (Kind) {
2740 case RecurKind::Add:
2741 return ScalarBits <= 64;
2742 default:
2743 return false;
2744 }
2745}
2746
2747 bool ARMTTIImpl::preferPredicatedReductionSelect() const {
2748 if (!ST->hasMVEIntegerOps())
2749 return false;
2750 return true;
2751}
2752
2753 InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
2754 StackOffset BaseOffset,
2755 bool HasBaseReg, int64_t Scale,
2756 unsigned AddrSpace) const {
2757 TargetLoweringBase::AddrMode AM;
2758 AM.BaseGV = BaseGV;
2759 AM.BaseOffs = BaseOffset.getFixed();
2760 AM.HasBaseReg = HasBaseReg;
2761 AM.Scale = Scale;
2762 AM.ScalableOffset = BaseOffset.getScalable();
2763 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
2764 if (ST->hasFPAO())
2765 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2766 return 0;
2767 }
2768 return -1;
2769}
2770
2771bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2772 if (Thumb) {
2773 // B.W is available in any Thumb2-supporting target, and also in every
2774 // version of Armv8-M, even Baseline which does not include the rest of
2775 // Thumb2.
2776 return ST->isThumb2() || ST->hasV8MBaselineOps();
2777 } else {
2778 // B is available in all versions of the Arm ISA, so the only question is
2779 // whether that ISA is available at all.
2780 return ST->hasARMOps();
2781 }
2782}
2783
2784/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2785/// of the vector elements.
2786static bool areExtractExts(Value *Ext1, Value *Ext2) {
2787 using namespace PatternMatch;
2788
2789 auto areExtDoubled = [](Instruction *Ext) {
2790 return Ext->getType()->getScalarSizeInBits() ==
2791 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
2792 };
2793
2794 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
2795 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
2796 !areExtDoubled(cast<Instruction>(Ext1)) ||
2797 !areExtDoubled(cast<Instruction>(Ext2)))
2798 return false;
2799
2800 return true;
2801}
2802
2803/// Check if sinking \p I's operands to I's basic block is profitable, because
2804/// the operands can be folded into a target instruction, e.g.
2805 /// sext/zext can be folded into vsubl.
2806 bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
2807 SmallVectorImpl<Use *> &Ops) const {
2808 using namespace PatternMatch;
2809
2810 if (!I->getType()->isVectorTy())
2811 return false;
2812
2813 if (ST->hasNEON()) {
2814 switch (I->getOpcode()) {
2815 case Instruction::Sub:
2816 case Instruction::Add: {
2817 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
2818 return false;
2819 Ops.push_back(&I->getOperandUse(0));
2820 Ops.push_back(&I->getOperandUse(1));
2821 return true;
2822 }
2823 default:
2824 return false;
2825 }
2826 }
2827
2828 if (!ST->hasMVEIntegerOps())
2829 return false;
2830
2831 auto IsFMSMul = [&](Instruction *I) {
2832 if (!I->hasOneUse())
2833 return false;
2834 auto *Sub = cast<Instruction>(*I->users().begin());
2835 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
2836 };
2837 auto IsFMS = [&](Instruction *I) {
2838 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
2839 match(I->getOperand(1), m_FNeg(m_Value())))
2840 return true;
2841 return false;
2842 };
2843
2844 auto IsSinker = [&](Instruction *I, int Operand) {
2845 switch (I->getOpcode()) {
2846 case Instruction::Add:
2847 case Instruction::Mul:
2848 case Instruction::FAdd:
2849 case Instruction::ICmp:
2850 case Instruction::FCmp:
2851 return true;
2852 case Instruction::FMul:
2853 return !IsFMSMul(I);
2854 case Instruction::Sub:
2855 case Instruction::FSub:
2856 case Instruction::Shl:
2857 case Instruction::LShr:
2858 case Instruction::AShr:
2859 return Operand == 1;
2860 case Instruction::Call:
2861 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
2862 switch (II->getIntrinsicID()) {
2863 case Intrinsic::fma:
2864 return !IsFMS(I);
2865 case Intrinsic::sadd_sat:
2866 case Intrinsic::uadd_sat:
2867 case Intrinsic::arm_mve_add_predicated:
2868 case Intrinsic::arm_mve_mul_predicated:
2869 case Intrinsic::arm_mve_qadd_predicated:
2870 case Intrinsic::arm_mve_vhadd:
2871 case Intrinsic::arm_mve_hadd_predicated:
2872 case Intrinsic::arm_mve_vqdmull:
2873 case Intrinsic::arm_mve_vqdmull_predicated:
2874 case Intrinsic::arm_mve_vqdmulh:
2875 case Intrinsic::arm_mve_qdmulh_predicated:
2876 case Intrinsic::arm_mve_vqrdmulh:
2877 case Intrinsic::arm_mve_qrdmulh_predicated:
2878 case Intrinsic::arm_mve_fma_predicated:
2879 return true;
2880 case Intrinsic::ssub_sat:
2881 case Intrinsic::usub_sat:
2882 case Intrinsic::arm_mve_sub_predicated:
2883 case Intrinsic::arm_mve_qsub_predicated:
2884 case Intrinsic::arm_mve_hsub_predicated:
2885 case Intrinsic::arm_mve_vhsub:
2886 return Operand == 1;
2887 default:
2888 return false;
2889 }
2890 }
2891 return false;
2892 default:
2893 return false;
2894 }
2895 };
2896
2897 for (auto OpIdx : enumerate(I->operands())) {
2898 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2899 // Make sure we are not already sinking this operand
2900 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2901 continue;
2902
2903 Instruction *Shuffle = Op;
2904 if (Shuffle->getOpcode() == Instruction::BitCast)
2905 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
2906 // We are looking for a splat that can be sunk.
2907 if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(),
2908 m_ZeroInt()),
2909 m_Undef(), m_ZeroMask())))
2910 continue;
2911 if (!IsSinker(I, OpIdx.index()))
2912 continue;
2913
2914 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2915 // and vector registers
2916 for (Use &U : Op->uses()) {
2917 Instruction *Insn = cast<Instruction>(U.getUser());
2918 if (!IsSinker(Insn, U.getOperandNo()))
2919 return false;
2920 }
2921
2922 Ops.push_back(&Shuffle->getOperandUse(0));
2923 if (Shuffle != Op)
2924 Ops.push_back(&Op->getOperandUse(0));
2925 Ops.push_back(&OpIdx.value());
2926 }
2927 return true;
2928}
2929
2930 unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
2931 Type *ArrayType) const {
2932 if (!UseWidenGlobalArrays) {
2933 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
2934 return false;
2935 }
2936
2937 // Don't modify non-integer array types
2938 if (!ArrayType || !ArrayType->isArrayTy() ||
2939 !ArrayType->getArrayElementType()->isIntegerTy())
2940 return 0;
2941
2942 // We pad to 4 byte boundaries
2943 if (Size % 4 == 0)
2944 return 0;
2945
2946 unsigned NumBytesToPad = 4 - (Size % 4);
2947 unsigned NewSize = Size + NumBytesToPad;
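// Worked example (illustrative): an i8 array of Size = 13 gives
// NumBytesToPad = 4 - (13 % 4) = 3 and NewSize = 16, which is only applied if
// 16 stays at or below the memcpy inline threshold checked below.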
2948
2949 // Max number of bytes that memcpy allows for lowering to load/stores before
2950 // it uses library function (__aeabi_memcpy).
2951 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
2952
2953 if (NewSize > MaxMemIntrinsicSize)
2954 return 0;
2955
2956 return NumBytesToPad;
2957}
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool useSoftFloat() const override
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Class to represent array types.
Definition: DerivedTypes.h:398
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:888
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
Definition: BasicTTIImpl.h:459
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
Compute a cost of the given call instruction.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Definition: BasicTTIImpl.h:702
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition: BasicTTIImpl.h:774
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:997
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:708
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:688
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:154
This class represents a range of values.
Definition: ConstantRange.h:47
LLVM_ABI ConstantRange intersectWith(const ConstantRange &CR, PreferredRangeType Type=Smallest) const
Return the range that results from the intersection of this range with another range.
This is an important base class in LLVM.
Definition: Constant.h:43
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:674
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1864
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted over NumElts elements.
Definition: IRBuilder.cpp:1115
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:502
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
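As a rough illustration of the IRBuilder calls listed above, the sketch below appends an aligned load to a block and splats the result into a vector; the function name, block, and pointer are hypothetical and supplied by the caller:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *buildSplattedLoad(BasicBlock *BB, Value *Ptr) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB); // append new instructions at the end of BB
  // Load an i32 with a known 4-byte alignment.
  Type *I32 = Type::getInt32Ty(BB->getContext());
  Value *Ld = Builder.CreateAlignedLoad(I32, Ptr, Align(4), "ld");
  // Broadcast the loaded scalar into a <4 x i32> vector.
  return Builder.CreateVectorSplat(4, Ld, "splat");
}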
The core instruction combiner logic.
Definition: InstCombiner.h:48
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:337
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
DominatorTree & getDominatorTree() const
Definition: InstCombiner.h:336
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:388
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:412
BuilderTy & Builder
Definition: InstCombiner.h:61
AssumptionCache & getAssumptionCache() const
Definition: InstCombiner.h:334
static InstructionCost getInvalid(CostType Val=0)
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:171
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:312
bool isShift() const
Definition: Instruction.h:320
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
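A hedged sketch of how these ScalarEvolution queries are often combined, for example to form a trip-count expression as backedge-taken count plus one (SE and L are assumed to be supplied; the helper name is made up):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

const SCEV *getTripCountExpr(ScalarEvolution &SE, const Loop *L) {
  // Bail out if the backedge-taken count is not analyzable and loop-invariant.
  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
    return nullptr;
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  // Trip count = backedge-taken count + 1, as a canonical add expression.
  SmallVector<const SCEV *, 2> Ops = {BTC, SE.getOne(BTC->getType())};
  return SE.getAddExpr(Ops);
}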
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:34
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:44
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:43
Provides information about what library functions are available for the current target.
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxStoresPerMemcpy(bool OptSize) const
Get maximum # of store operations permitted for llvm.memcpy.
unsigned getMaxStoresPerMemmove(bool OptSize) const
Get maximum # of store operations permitted for llvm.memmove.
unsigned getMaxStoresPerMemset(bool OptSize) const
Get maximum # of store operations permitted for llvm.memset.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
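The lowering-information queries above are typically chained as in the sketch below, which asks whether an integer multiply on a given IR type is legal (or can be custom-lowered or promoted); the function name and opcode choice are illustrative only:

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

bool isMulLegalForType(const TargetLoweringBase &TLI, const DataLayout &DL,
                       Type *Ty) {
  // Map the IR opcode to its SelectionDAG (ISD) node.
  int ISDOpc = TLI.InstructionOpcodeToISD(Instruction::Mul);
  // Translate the IR type to an EVT and query the lowering action.
  EVT VT = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
  if (!VT.isSimple())
    return false;
  return TLI.isOperationLegalOrCustomOrPromote(ISDOpc, VT);
}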
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
virtual bool isLoweredToCall(const Function *F) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:264
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:246
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:408
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
LLVM_ABI unsigned getIntegerBitWidth() const
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
const Use & getOperandUse(unsigned i) const
Definition: User.h:245
Value * getOperand(unsigned i) const
Definition: User.h:232
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:150
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:695
Type * getElementType() const
Definition: DerivedTypes.h:463
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
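Inside the ARM backend these encoding helpers are commonly used to decide whether a 32-bit constant is free as a shifter-operand immediate. A small sketch (the helper name is hypothetical, and the header path assumes code living under the ARM target directory):

#include "MCTargetDesc/ARMAddressingModes.h"

// Returns true if Imm can be materialized directly as a shifter-operand
// immediate in the selected (ARM or Thumb-2) encoding.
static bool isCheapARMImmediate(unsigned Imm, bool IsThumb2) {
  if (IsThumb2)
    return ARM_AM::getT2SOImmVal(Imm) != -1;
  return ARM_AM::getSOImmVal(Imm) != -1;
}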
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:165
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
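An illustrative use of the PatternMatch helpers above: the standalone sketch below detects an add of two zero- or sign-extended values in either operand order, roughly the shape that extended-reduction matching looks for (the function name and the bound values A/B are made up):

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Match "add (zext/sext A), (zext/sext B)" commutatively.
static bool isExtendedAdd(Value *V, Value *&A, Value *&B) {
  return match(V, m_c_Add(m_ZExtOrSExt(m_Value(A)),
                          m_ZExtOrSExt(m_Value(B))));
}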
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1121
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
@ Runtime
Detect stack use after return if not disabled runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition: Local.h:252
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
LLVM_ABI SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
Definition: LoopUtils.cpp:124
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMIN
Signed minimum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
LLVM_ABI SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1980
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
InstructionCost Cost
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
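The two lookup helpers above support the usual static-cost-table idiom. A compact sketch (the table contents and default cost are invented purely for illustration):

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

static unsigned lookupExampleCost(int ISDOpc, MVT Dst, MVT Src) {
  // Hypothetical conversion costs, keyed by (ISD opcode, Dst type, Src type).
  static const TypeConversionCostTblEntry Table[] = {
      {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
      {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
  };
  if (const auto *Entry = ConvertCostTableLookup(Table, ISDOpc, Dst, Src))
    return Entry->Cost;
  return 4; // made-up fallback cost
}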
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
Attributes of a target dependent hardware loop.
LLVM_ABI bool canAnalyze(LoopInfo &LI)
LLVM_ABI bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
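A minimal sketch of how a hardware-loop candidacy check is usually phrased with the two calls above (the wrapper name is hypothetical; the real target hook also fills in counter type and decrement information):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

static bool mayFormHardwareLoop(Loop *L, ScalarEvolution &SE, LoopInfo &LI,
                                DominatorTree &DT) {
  HardwareLoopInfo HWLoopInfo(L);
  if (!HWLoopInfo.canAnalyze(LI))
    return false;
  return HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT);
}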
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
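MemOp bundles the parameters of a memory transfer so a target can be asked how it would lower it. A small sketch describing a hypothetical 16-byte memcpy with 4-byte aligned source and destination (all values invented for illustration):

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

static MemOp describeSmallCopy() {
  return MemOp::Copy(/*Size=*/16, /*DstAlignCanChange=*/false,
                     /*DstAlign=*/Align(4), /*SrcAlign=*/Align(4),
                     /*IsVolatile=*/false);
}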
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Type Conversion Cost Table.
Definition: CostTable.h:55