LLVM 22.0.0git
BasicTTIImpl.h
Go to the documentation of this file.
1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/BitVector.h"
21#include "llvm/ADT/STLExtras.h"
35#include "llvm/IR/BasicBlock.h"
36#include "llvm/IR/Constant.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
40#include "llvm/IR/InstrTypes.h"
41#include "llvm/IR/Instruction.h"
43#include "llvm/IR/Intrinsics.h"
44#include "llvm/IR/Operator.h"
45#include "llvm/IR/Type.h"
46#include "llvm/IR/Value.h"
54#include <algorithm>
55#include <cassert>
56#include <cstdint>
57#include <limits>
58#include <optional>
59#include <utility>
60
61namespace llvm {
62
63class Function;
64class GlobalValue;
65class LLVMContext;
66class ScalarEvolution;
67class SCEV;
68class TargetMachine;
69
71
72/// Base class which can be used to help build a TTI implementation.
73///
74/// This class provides as much implementation of the TTI interface as is
75/// possible using the target independent parts of the code generator.
76///
77/// In order to subclass it, your class must implement a getST() method to
78/// return the subtarget, and a getTLI() method to return the target lowering.
79/// We need these methods implemented in the derived class so that this class
80/// doesn't have to duplicate storage for them.
81template <typename T>
83private:
86
87 /// Helper function to access this as a T.
88 const T *thisT() const { return static_cast<const T *>(this); }
89
90 /// Estimate a cost of Broadcast as an extract and sequence of insert
91 /// operations.
93 getBroadcastShuffleOverhead(FixedVectorType *VTy,
96 // Broadcast cost is equal to the cost of extracting the zero'th element
97 // plus the cost of inserting it into every element of the result vector.
98 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
99 CostKind, 0, nullptr, nullptr);
100
101 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
102 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
103 CostKind, i, nullptr, nullptr);
104 }
105 return Cost;
106 }
107
108 /// Estimate a cost of shuffle as a sequence of extract and insert
109 /// operations.
111 getPermuteShuffleOverhead(FixedVectorType *VTy,
114 // Shuffle cost is equal to the cost of extracting element from its argument
115 // plus the cost of inserting them onto the result vector.
116
117 // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from
118 // index 0 of first vector, index 1 of second vector,index 2 of first
119 // vector and finally index 3 of second vector and insert them at index
120 // <0,1,2,3> of result vector.
121 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
122 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
123 CostKind, i, nullptr, nullptr);
124 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
125 CostKind, i, nullptr, nullptr);
126 }
127 return Cost;
128 }
129
130 /// Estimate a cost of subvector extraction as a sequence of extract and
131 /// insert operations.
132 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
134 int Index,
135 FixedVectorType *SubVTy) const {
136 assert(VTy && SubVTy &&
137 "Can only extract subvectors from vectors");
138 int NumSubElts = SubVTy->getNumElements();
139 assert((!isa<FixedVectorType>(VTy) ||
140 (Index + NumSubElts) <=
141 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
142 "SK_ExtractSubvector index out of range");
143
145 // Subvector extraction cost is equal to the cost of extracting element from
146 // the source type plus the cost of inserting them into the result vector
147 // type.
148 for (int i = 0; i != NumSubElts; ++i) {
149 Cost +=
150 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
151 CostKind, i + Index, nullptr, nullptr);
152 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
153 CostKind, i, nullptr, nullptr);
154 }
155 return Cost;
156 }
157
158 /// Estimate a cost of subvector insertion as a sequence of extract and
159 /// insert operations.
160 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
162 int Index,
163 FixedVectorType *SubVTy) const {
164 assert(VTy && SubVTy &&
165 "Can only insert subvectors into vectors");
166 int NumSubElts = SubVTy->getNumElements();
167 assert((!isa<FixedVectorType>(VTy) ||
168 (Index + NumSubElts) <=
169 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
170 "SK_InsertSubvector index out of range");
171
173 // Subvector insertion cost is equal to the cost of extracting element from
174 // the source type plus the cost of inserting them into the result vector
175 // type.
176 for (int i = 0; i != NumSubElts; ++i) {
177 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
178 CostKind, i, nullptr, nullptr);
179 Cost +=
180 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
181 i + Index, nullptr, nullptr);
182 }
183 return Cost;
184 }
185
186 /// Local query method delegates up to T which *must* implement this!
187 const TargetSubtargetInfo *getST() const {
188 return static_cast<const T *>(this)->getST();
189 }
190
191 /// Local query method delegates up to T which *must* implement this!
192 const TargetLoweringBase *getTLI() const {
193 return static_cast<const T *>(this)->getTLI();
194 }
195
196 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
197 switch (M) {
199 return ISD::UNINDEXED;
200 case TTI::MIM_PreInc:
201 return ISD::PRE_INC;
202 case TTI::MIM_PreDec:
203 return ISD::PRE_DEC;
204 case TTI::MIM_PostInc:
205 return ISD::POST_INC;
206 case TTI::MIM_PostDec:
207 return ISD::POST_DEC;
208 }
209 llvm_unreachable("Unexpected MemIndexedMode");
210 }
211
212 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
213 Align Alignment,
214 bool VariableMask,
215 bool IsGatherScatter,
217 unsigned AddressSpace = 0) const {
218 // We cannot scalarize scalable vectors, so return Invalid.
219 if (isa<ScalableVectorType>(DataTy))
221
222 auto *VT = cast<FixedVectorType>(DataTy);
223 unsigned VF = VT->getNumElements();
224
225 // Assume the target does not have support for gather/scatter operations
226 // and provide a rough estimate.
227 //
228 // First, compute the cost of the individual memory operations.
229 InstructionCost AddrExtractCost =
230 IsGatherScatter ? getScalarizationOverhead(
232 PointerType::get(VT->getContext(), 0), VF),
233 /*Insert=*/false, /*Extract=*/true, CostKind)
234 : 0;
235
236 // The cost of the scalar loads/stores.
237 InstructionCost MemoryOpCost =
238 VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
240
241 // Next, compute the cost of packing the result in a vector.
242 InstructionCost PackingCost =
243 getScalarizationOverhead(VT, Opcode != Instruction::Store,
244 Opcode == Instruction::Store, CostKind);
245
246 InstructionCost ConditionalCost = 0;
247 if (VariableMask) {
248 // Compute the cost of conditionally executing the memory operations with
249 // variable masks. This includes extracting the individual conditions, a
250 // branches and PHIs to combine the results.
251 // NOTE: Estimating the cost of conditionally executing the memory
252 // operations accurately is quite difficult and the current solution
253 // provides a very rough estimate only.
254 ConditionalCost =
257 /*Insert=*/false, /*Extract=*/true, CostKind) +
258 VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
259 thisT()->getCFInstrCost(Instruction::PHI, CostKind));
260 }
261
262 return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
263 }
264
265 /// Checks if the provided mask \p is a splat mask, i.e. it contains only -1
266 /// or same non -1 index value and this index value contained at least twice.
267 /// So, mask <0, -1,-1, -1> is not considered splat (it is just identity),
268 /// same for <-1, 0, -1, -1> (just a slide), while <2, -1, 2, -1> is a splat
269 /// with \p Index=2.
270 static bool isSplatMask(ArrayRef<int> Mask, unsigned NumSrcElts, int &Index) {
271 // Check that the broadcast index meets at least twice.
272 bool IsCompared = false;
273 if (int SplatIdx = PoisonMaskElem;
274 all_of(enumerate(Mask), [&](const auto &P) {
275 if (P.value() == PoisonMaskElem)
276 return P.index() != Mask.size() - 1 || IsCompared;
277 if (static_cast<unsigned>(P.value()) >= NumSrcElts * 2)
278 return false;
279 if (SplatIdx == PoisonMaskElem) {
280 SplatIdx = P.value();
281 return P.index() != Mask.size() - 1;
282 }
283 IsCompared = true;
284 return SplatIdx == P.value();
285 })) {
286 Index = SplatIdx;
287 return true;
288 }
289 return false;
290 }
291
292 /// Several intrinsics that return structs (including llvm.sincos[pi] and
293 /// llvm.modf) can be lowered to a vector library call (for certain VFs). The
294 /// vector library functions correspond to the scalar calls (e.g. sincos or
295 /// modf), which unlike the intrinsic return values via output pointers. This
296 /// helper checks if a vector call exists for the given intrinsic, and returns
297 /// the cost, which includes the cost of the mask (if required), and the loads
298 /// for values returned via output pointers. \p LC is the scalar libcall and
299 /// \p CallRetElementIndex (optional) is the struct element which is mapped to
300 /// the call return value. If std::nullopt is returned, then no vector library
301 /// call is available, so the intrinsic should be assigned the default cost
302 /// (e.g. scalarization).
303 std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost(
305 RTLIB::Libcall LC,
306 std::optional<unsigned> CallRetElementIndex = {}) const {
307 Type *RetTy = ICA.getReturnType();
308 // Vector variants of the intrinsic can be mapped to a vector library call.
309 auto const *LibInfo = ICA.getLibInfo();
310 if (!LibInfo || !isa<StructType>(RetTy) ||
311 !isVectorizedStructTy(cast<StructType>(RetTy)))
312 return std::nullopt;
313
314 // Find associated libcall.
315 const char *LCName = getTLI()->getLibcallName(LC);
316 if (!LCName)
317 return std::nullopt;
318
319 // Search for a corresponding vector variant.
320 LLVMContext &Ctx = RetTy->getContext();
322 VecDesc const *VD = nullptr;
323 for (bool Masked : {false, true}) {
324 if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
325 break;
326 }
327 if (!VD)
328 return std::nullopt;
329
330 // Cost the call + mask.
331 auto Cost =
332 thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
333 if (VD->isMasked()) {
334 auto VecTy = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
335 Cost += thisT()->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
336 VecTy, {}, CostKind, 0, nullptr, {});
337 }
338
339 // Lowering to a library call (with output pointers) may require us to emit
340 // reloads for the results.
341 for (auto [Idx, VectorTy] : enumerate(getContainedTypes(RetTy))) {
342 if (Idx == CallRetElementIndex)
343 continue;
344 Cost += thisT()->getMemoryOpCost(
345 Instruction::Load, VectorTy,
346 thisT()->getDataLayout().getABITypeAlign(VectorTy), 0, CostKind);
347 }
348 return Cost;
349 }
350
351 /// Filter out constant and duplicated entries in \p Ops and return a vector
352 /// containing the types from \p Tys corresponding to the remaining operands.
354 filterConstantAndDuplicatedOperands(ArrayRef<const Value *> Ops,
355 ArrayRef<Type *> Tys) {
356 SmallPtrSet<const Value *, 4> UniqueOperands;
357 SmallVector<Type *, 4> FilteredTys;
358 for (const auto &[Op, Ty] : zip_equal(Ops, Tys)) {
359 if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second)
360 continue;
361 FilteredTys.push_back(Ty);
362 }
363 return FilteredTys;
364 }
365
366protected:
367 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
368 : BaseT(DL) {}
369 virtual ~BasicTTIImplBase() = default;
370
372
373public:
374 /// \name Scalar TTI Implementations
375 /// @{
377 unsigned AddressSpace, Align Alignment,
378 unsigned *Fast) const override {
380 return getTLI()->allowsMisalignedMemoryAccesses(
382 }
383
384 bool areInlineCompatible(const Function *Caller,
385 const Function *Callee) const override {
386 const TargetMachine &TM = getTLI()->getTargetMachine();
387
388 const FeatureBitset &CallerBits =
389 TM.getSubtargetImpl(*Caller)->getFeatureBits();
390 const FeatureBitset &CalleeBits =
391 TM.getSubtargetImpl(*Callee)->getFeatureBits();
392
393 // Inline a callee if its target-features are a subset of the callers
394 // target-features.
395 return (CallerBits & CalleeBits) == CalleeBits;
396 }
397
398 bool hasBranchDivergence(const Function *F = nullptr) const override {
399 return false;
400 }
401
402 bool isSourceOfDivergence(const Value *V) const override { return false; }
403
404 bool isAlwaysUniform(const Value *V) const override { return false; }
405
406 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
407 return false;
408 }
409
410 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
411 return true;
412 }
413
414 unsigned getFlatAddressSpace() const override {
415 // Return an invalid address space.
416 return -1;
417 }
418
420 Intrinsic::ID IID) const override {
421 return false;
422 }
423
424 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
425 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
426 }
427
428 unsigned getAssumedAddrSpace(const Value *V) const override {
429 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
430 }
431
432 bool isSingleThreaded() const override {
433 return getTLI()->getTargetMachine().Options.ThreadModel ==
435 }
436
437 std::pair<const Value *, unsigned>
438 getPredicatedAddrSpace(const Value *V) const override {
439 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
440 }
441
443 Value *NewV) const override {
444 return nullptr;
445 }
446
447 bool isLegalAddImmediate(int64_t imm) const override {
448 return getTLI()->isLegalAddImmediate(imm);
449 }
450
451 bool isLegalAddScalableImmediate(int64_t Imm) const override {
452 return getTLI()->isLegalAddScalableImmediate(Imm);
453 }
454
455 bool isLegalICmpImmediate(int64_t imm) const override {
456 return getTLI()->isLegalICmpImmediate(imm);
457 }
458
459 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
460 bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
461 Instruction *I = nullptr,
462 int64_t ScalableOffset = 0) const override {
464 AM.BaseGV = BaseGV;
465 AM.BaseOffs = BaseOffset;
466 AM.HasBaseReg = HasBaseReg;
467 AM.Scale = Scale;
468 AM.ScalableOffset = ScalableOffset;
469 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
470 }
471
472 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
473 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
474 }
475
476 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
477 Type *ScalarValTy) const override {
478 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
479 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
480 EVT VT = getTLI()->getValueType(DL, SrcTy);
481 if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
482 getTLI()->isOperationCustom(ISD::STORE, VT))
483 return true;
484
485 EVT ValVT =
486 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
487 EVT LegalizedVT =
488 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
489 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
490 };
491 while (VF > 2 && IsSupportedByTarget(VF))
492 VF /= 2;
493 return VF;
494 }
495
496 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty) const override {
497 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
498 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
499 }
500
501 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty) const override {
502 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
503 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
504 }
505
507 const TTI::LSRCost &C2) const override {
509 }
510
511 bool isNumRegsMajorCostOfLSR() const override {
513 }
514
517 }
518
521 }
522
524 StackOffset BaseOffset, bool HasBaseReg,
525 int64_t Scale,
526 unsigned AddrSpace) const override {
528 AM.BaseGV = BaseGV;
529 AM.BaseOffs = BaseOffset.getFixed();
530 AM.HasBaseReg = HasBaseReg;
531 AM.Scale = Scale;
532 AM.ScalableOffset = BaseOffset.getScalable();
533 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
534 return 0;
536 }
537
538 bool isTruncateFree(Type *Ty1, Type *Ty2) const override {
539 return getTLI()->isTruncateFree(Ty1, Ty2);
540 }
541
542 bool isProfitableToHoist(Instruction *I) const override {
543 return getTLI()->isProfitableToHoist(I);
544 }
545
546 bool useAA() const override { return getST()->useAA(); }
547
548 bool isTypeLegal(Type *Ty) const override {
549 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
550 return getTLI()->isTypeLegal(VT);
551 }
552
553 unsigned getRegUsageForType(Type *Ty) const override {
554 EVT ETy = getTLI()->getValueType(DL, Ty);
555 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
556 }
557
560 TTI::TargetCostKind CostKind) const override {
561 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
562 }
563
565 const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI,
566 BlockFrequencyInfo *BFI) const override {
567 /// Try to find the estimated number of clusters. Note that the number of
568 /// clusters identified in this function could be different from the actual
569 /// numbers found in lowering. This function ignore switches that are
570 /// lowered with a mix of jump table / bit test / BTree. This function was
571 /// initially intended to be used when estimating the cost of switch in
572 /// inline cost heuristic, but it's a generic cost model to be used in other
573 /// places (e.g., in loop unrolling).
574 unsigned N = SI.getNumCases();
575 const TargetLoweringBase *TLI = getTLI();
576 const DataLayout &DL = this->getDataLayout();
577
578 JumpTableSize = 0;
579 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
580
581 // Early exit if both a jump table and bit test are not allowed.
582 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
583 return N;
584
585 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
586 APInt MinCaseVal = MaxCaseVal;
587 for (auto CI : SI.cases()) {
588 const APInt &CaseVal = CI.getCaseValue()->getValue();
589 if (CaseVal.sgt(MaxCaseVal))
590 MaxCaseVal = CaseVal;
591 if (CaseVal.slt(MinCaseVal))
592 MinCaseVal = CaseVal;
593 }
594
595 // Check if suitable for a bit test
596 if (N <= DL.getIndexSizeInBits(0u)) {
598 for (auto I : SI.cases())
599 Dests.insert(I.getCaseSuccessor());
600
601 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
602 DL))
603 return 1;
604 }
605
606 // Check if suitable for a jump table.
607 if (IsJTAllowed) {
608 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
609 return N;
611 (MaxCaseVal - MinCaseVal)
612 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
613 // Check whether a range of clusters is dense enough for a jump table
614 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
615 JumpTableSize = Range;
616 return 1;
617 }
618 }
619 return N;
620 }
621
622 bool shouldBuildLookupTables() const override {
623 const TargetLoweringBase *TLI = getTLI();
624 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
625 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
626 }
627
628 bool shouldBuildRelLookupTables() const override {
629 const TargetMachine &TM = getTLI()->getTargetMachine();
630 // If non-PIC mode, do not generate a relative lookup table.
631 if (!TM.isPositionIndependent())
632 return false;
633
634 /// Relative lookup table entries consist of 32-bit offsets.
635 /// Do not generate relative lookup tables for large code models
636 /// in 64-bit achitectures where 32-bit offsets might not be enough.
637 if (TM.getCodeModel() == CodeModel::Medium ||
638 TM.getCodeModel() == CodeModel::Large)
639 return false;
640
641 const Triple &TargetTriple = TM.getTargetTriple();
642 if (!TargetTriple.isArch64Bit())
643 return false;
644
645 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
646 // there.
647 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
648 return false;
649
650 return true;
651 }
652
653 bool haveFastSqrt(Type *Ty) const override {
654 const TargetLoweringBase *TLI = getTLI();
655 EVT VT = TLI->getValueType(DL, Ty);
656 return TLI->isTypeLegal(VT) &&
658 }
659
660 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override { return true; }
661
662 InstructionCost getFPOpCost(Type *Ty) const override {
663 // Check whether FADD is available, as a proxy for floating-point in
664 // general.
665 const TargetLoweringBase *TLI = getTLI();
666 EVT VT = TLI->getValueType(DL, Ty);
670 }
671
673 const Function &Fn) const override {
674 switch (Inst.getOpcode()) {
675 default:
676 break;
677 case Instruction::SDiv:
678 case Instruction::SRem:
679 case Instruction::UDiv:
680 case Instruction::URem: {
681 if (!isa<ConstantInt>(Inst.getOperand(1)))
682 return false;
683 EVT VT = getTLI()->getValueType(DL, Inst.getType());
684 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
685 }
686 };
687
688 return false;
689 }
690
691 unsigned getInliningThresholdMultiplier() const override { return 1; }
692 unsigned adjustInliningThreshold(const CallBase *CB) const override {
693 return 0;
694 }
695 unsigned getCallerAllocaCost(const CallBase *CB,
696 const AllocaInst *AI) const override {
697 return 0;
698 }
699
700 int getInlinerVectorBonusPercent() const override { return 150; }
701
704 OptimizationRemarkEmitter *ORE) const override {
705 // This unrolling functionality is target independent, but to provide some
706 // motivation for its intended use, for x86:
707
708 // According to the Intel 64 and IA-32 Architectures Optimization Reference
709 // Manual, Intel Core models and later have a loop stream detector (and
710 // associated uop queue) that can benefit from partial unrolling.
711 // The relevant requirements are:
712 // - The loop must have no more than 4 (8 for Nehalem and later) branches
713 // taken, and none of them may be calls.
714 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
715
716 // According to the Software Optimization Guide for AMD Family 15h
717 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
718 // and loop buffer which can benefit from partial unrolling.
719 // The relevant requirements are:
720 // - The loop must have fewer than 16 branches
721 // - The loop must have less than 40 uops in all executed loop branches
722
723 // The number of taken branches in a loop is hard to estimate here, and
724 // benchmarking has revealed that it is better not to be conservative when
725 // estimating the branch count. As a result, we'll ignore the branch limits
726 // until someone finds a case where it matters in practice.
727
728 unsigned MaxOps;
729 const TargetSubtargetInfo *ST = getST();
730 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
732 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
733 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
734 else
735 return;
736
737 // Scan the loop: don't unroll loops with calls.
738 for (BasicBlock *BB : L->blocks()) {
739 for (Instruction &I : *BB) {
740 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
741 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
742 if (!thisT()->isLoweredToCall(F))
743 continue;
744 }
745
746 if (ORE) {
747 ORE->emit([&]() {
748 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
749 L->getHeader())
750 << "advising against unrolling the loop because it "
751 "contains a "
752 << ore::NV("Call", &I);
753 });
754 }
755 return;
756 }
757 }
758 }
759
760 // Enable runtime and partial unrolling up to the specified size.
761 // Enable using trip count upper bound to unroll loops.
762 UP.Partial = UP.Runtime = UP.UpperBound = true;
763 UP.PartialThreshold = MaxOps;
764
765 // Avoid unrolling when optimizing for size.
766 UP.OptSizeThreshold = 0;
768
769 // Set number of instructions optimized when "back edge"
770 // becomes "fall through" to default value of 2.
771 UP.BEInsns = 2;
772 }
773
775 TTI::PeelingPreferences &PP) const override {
776 PP.PeelCount = 0;
777 PP.AllowPeeling = true;
778 PP.AllowLoopNestsPeeling = false;
779 PP.PeelProfiledIterations = true;
780 }
781
784 HardwareLoopInfo &HWLoopInfo) const override {
785 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
786 }
787
788 unsigned getEpilogueVectorizationMinVF() const override {
790 }
791
794 }
795
797 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override {
798 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
799 }
800
801 std::optional<Instruction *>
804 }
805
806 std::optional<Value *>
808 APInt DemandedMask, KnownBits &Known,
809 bool &KnownBitsComputed) const override {
810 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
811 KnownBitsComputed);
812 }
813
815 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
816 APInt &UndefElts2, APInt &UndefElts3,
817 std::function<void(Instruction *, unsigned, APInt, APInt &)>
818 SimplifyAndSetOp) const override {
820 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
821 SimplifyAndSetOp);
822 }
823
824 virtual std::optional<unsigned>
826 return std::optional<unsigned>(
827 getST()->getCacheSize(static_cast<unsigned>(Level)));
828 }
829
830 virtual std::optional<unsigned>
832 std::optional<unsigned> TargetResult =
833 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
834
835 if (TargetResult)
836 return TargetResult;
837
838 return BaseT::getCacheAssociativity(Level);
839 }
840
841 virtual unsigned getCacheLineSize() const override {
842 return getST()->getCacheLineSize();
843 }
844
845 virtual unsigned getPrefetchDistance() const override {
846 return getST()->getPrefetchDistance();
847 }
848
849 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
850 unsigned NumStridedMemAccesses,
851 unsigned NumPrefetches,
852 bool HasCall) const override {
853 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
854 NumPrefetches, HasCall);
855 }
856
857 virtual unsigned getMaxPrefetchIterationsAhead() const override {
858 return getST()->getMaxPrefetchIterationsAhead();
859 }
860
861 virtual bool enableWritePrefetching() const override {
862 return getST()->enableWritePrefetching();
863 }
864
865 virtual bool shouldPrefetchAddressSpace(unsigned AS) const override {
866 return getST()->shouldPrefetchAddressSpace(AS);
867 }
868
869 /// @}
870
871 /// \name Vector TTI Implementations
872 /// @{
873
876 return TypeSize::getFixed(32);
877 }
878
879 std::optional<unsigned> getMaxVScale() const override { return std::nullopt; }
880 std::optional<unsigned> getVScaleForTuning() const override {
881 return std::nullopt;
882 }
883 bool isVScaleKnownToBeAPowerOfTwo() const override { return false; }
884
885 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
886 /// are set if the demanded result elements need to be inserted and/or
887 /// extracted from vectors.
889 VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
890 TTI::TargetCostKind CostKind, bool ForPoisonSrc = true,
891 ArrayRef<Value *> VL = {}) const override {
892 /// FIXME: a bitfield is not a reasonable abstraction for talking about
893 /// which elements are needed from a scalable vector
894 if (isa<ScalableVectorType>(InTy))
896 auto *Ty = cast<FixedVectorType>(InTy);
897
898 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
899 (VL.empty() || VL.size() == Ty->getNumElements()) &&
900 "Vector size mismatch");
901
903
904 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
905 if (!DemandedElts[i])
906 continue;
907 if (Insert) {
908 Value *InsertedVal = VL.empty() ? nullptr : VL[i];
909 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
910 CostKind, i, nullptr, InsertedVal);
911 }
912 if (Extract)
913 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
914 CostKind, i, nullptr, nullptr);
915 }
916
917 return Cost;
918 }
919
921 return false;
922 }
923
924 bool
926 unsigned ScalarOpdIdx) const override {
927 return false;
928 }
929
931 int OpdIdx) const override {
932 return OpdIdx == -1;
933 }
934
935 bool
937 int RetIdx) const override {
938 return RetIdx == 0;
939 }
940
941 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
943 bool Extract,
945 if (isa<ScalableVectorType>(InTy))
947 auto *Ty = cast<FixedVectorType>(InTy);
948
949 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
950 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
951 CostKind);
952 }
953
954 /// Estimate the overhead of scalarizing an instruction's
955 /// operands. The (potentially vector) types to use for each of
956 /// argument are passes via Tys.
958 ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const override {
960 for (Type *Ty : Tys) {
961 // Disregard things like metadata arguments.
962 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
963 !Ty->isPtrOrPtrVectorTy())
964 continue;
965
966 if (auto *VecTy = dyn_cast<VectorType>(Ty))
967 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
968 /*Extract*/ true, CostKind);
969 }
970
971 return Cost;
972 }
973
974 /// Estimate the overhead of scalarizing the inputs and outputs of an
975 /// instruction, with return type RetTy and arguments Args of type Tys. If
976 /// Args are unknown (empty), then the cost associated with one argument is
977 /// added as a heuristic.
983 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
984 if (!Args.empty())
986 filterConstantAndDuplicatedOperands(Args, Tys), CostKind);
987 else
988 // When no information on arguments is provided, we add the cost
989 // associated with one argument as a heuristic.
990 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
991 /*Extract*/ true, CostKind);
992
993 return Cost;
994 }
995
996 /// Estimate the cost of type-legalization and the legalized type.
997 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
998 LLVMContext &C = Ty->getContext();
999 EVT MTy = getTLI()->getValueType(DL, Ty);
1000
// NOTE(review): the initialization of the running cost (original line 1001)
// is missing from this extracted view.
1002 // We keep legalizing the type until we find a legal kind. We assume that
1003 // the only operation that costs anything is the split. After splitting
1004 // we need to handle two types.
1005 while (true) {
// NOTE(review): the getTypeConversion() query producing 'LK' and the check
// guarding the invalid-cost branch below (original lines 1006/1008) are
// missing from this view.
1007
1009 // Ensure we return a sensible simple VT here, since many callers of
1010 // this function require it.
// Invalid cost, but still a usable simple VT: fall back to i64 when MTy has
// no simple equivalent.
1011 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
1012 return std::make_pair(InstructionCost::getInvalid(), VT);
1013 }
1014
1015 if (LK.first == TargetLoweringBase::TypeLegal)
1016 return std::make_pair(Cost, MTy.getSimpleVT());
1017
// Each split step doubles the number of pieces to process.
// NOTE(review): the second operand of this '||' (original line 1019) is
// missing from this view.
1018 if (LK.first == TargetLoweringBase::TypeSplitVector ||
1020 Cost *= 2;
1021
1022 // Do not loop with f128 type.
1023 if (MTy == LK.second)
1024 return std::make_pair(Cost, MTy.getSimpleVT());
1025
1026 // Keep legalizing the type.
1027 MTy = LK.second;
1028 }
1029 }
1030
// Default: report no benefit from interleaving (factor 1) for any VF;
// targets override this to advertise wider interleave factors.
1031 unsigned getMaxInterleaveFactor(ElementCount VF) const override { return 1; }
1032
// Cost of an arithmetic instruction: legal/promoted ops cost LT.first (FP
// counted double), custom-lowered ops twice that, rem may be modeled as
// div+mul+sub, and everything else is scalarized.
// NOTE(review): the return type and the Opd1Info/Opd2Info parameter lines of
// this signature (original lines 1033/1035-1036) are missing from this
// extracted view.
1034 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1037 ArrayRef<const Value *> Args = {},
1038 const Instruction *CxtI = nullptr) const override {
1039 // Check if any of the operands are vector operands.
1040 const TargetLoweringBase *TLI = getTLI();
1041 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1042 assert(ISD && "Invalid opcode");
1043
1044 // TODO: Handle more cost kinds.
// NOTE(review): the cost-kind condition guarding this fallback return
// (original line 1045) is missing from this view.
1046 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
1047 Opd1Info, Opd2Info,
1048 Args, CxtI);
1049
1050 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1051
1052 bool IsFloat = Ty->isFPOrFPVectorTy();
1053 // Assume that floating point arithmetic operations cost twice as much as
1054 // integer operations.
1055 InstructionCost OpCost = (IsFloat ? 2 : 1);
1056
1057 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1058 // The operation is legal. Assume it costs 1.
1059 // TODO: Once we have extract/insert subvector cost we need to use them.
1060 return LT.first * OpCost;
1061 }
1062
1063 if (!TLI->isOperationExpand(ISD, LT.second)) {
1064 // If the operation is custom lowered, then assume that the code is twice
1065 // as expensive.
1066 return LT.first * 2 * OpCost;
1067 }
1068
1069 // An 'Expand' of URem and SRem is special because it may default
1070 // to expanding the operation into a sequence of sub-operations
1071 // i.e. X % Y -> X-(X/Y)*Y.
1072 if (ISD == ISD::UREM || ISD == ISD::SREM) {
1073 bool IsSigned = ISD == ISD::SREM;
1074 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
1075 LT.second) ||
1076 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
1077 LT.second)) {
// A usable division exists: price the remainder as div + mul + sub.
1078 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
1079 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
1080 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
1081 InstructionCost MulCost =
1082 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
1083 InstructionCost SubCost =
1084 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
1085 return DivCost + MulCost + SubCost;
1086 }
1087 }
1088
1089 // We cannot scalarize scalable vectors, so return Invalid.
1090 if (isa<ScalableVectorType>(Ty))
// NOTE(review): the body of this if — presumably
// 'return InstructionCost::getInvalid();' (original line 1091) — is missing
// from this view.
1092
1093 // Else, assume that we need to scalarize this op.
1094 // TODO: If one of the types get legalized by splitting, handle this
1095 // similarly to what getCastInstrCost() does.
1096 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1097 InstructionCost Cost = thisT()->getArithmeticInstrCost(
1098 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
1099 Args, CxtI);
1100 // Return the cost of multiple scalar invocation plus the cost of
1101 // inserting and extracting the values.
1102 SmallVector<Type *> Tys(Args.size(), Ty);
1103 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1104 VTy->getNumElements() * Cost;
1105 }
1106
1107 // We don't know anything about this scalar instruction.
1108 return OpCost;
1109 }
1110
// Refine a generic shuffle kind into a more specific one by inspecting the
// concrete mask (e.g. a two-source permute whose mask only reads source 0 is
// really single-source). Index/SubTy are out-parameters for subvector kinds.
// NOTE(review): the signature head of improveShuffleKindFromMask (original
// line 1111) is missing from this extracted view, as are the case labels /
// return statements flagged below.
1112 ArrayRef<int> Mask,
1113 VectorType *SrcTy, int &Index,
1114 VectorType *&SubTy) const {
1115 if (Mask.empty())
1116 return Kind;
1117 int NumDstElts = Mask.size();
1118 int NumSrcElts = SrcTy->getElementCount().getKnownMinValue();
1119 switch (Kind) {
// NOTE(review): a case label (original line 1120, presumably
// 'case TTI::SK_PermuteSingleSrc: {') is missing from this view.
1121 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
1122 return TTI::SK_Reverse;
1123 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
1124 return TTI::SK_Broadcast;
1125 if (isSplatMask(Mask, NumSrcElts, Index))
1126 return TTI::SK_Broadcast;
1127 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
1128 (Index + NumDstElts) <= NumSrcElts) {
1129 SubTy = FixedVectorType::get(SrcTy->getElementType(), NumDstElts);
// NOTE(review): the return for the extract-subvector case (original line
// 1130) is missing from this view.
1131 }
1132 break;
1133 }
1134 case TTI::SK_PermuteTwoSrc: {
// Mask only references the first source — re-query as a single-source
// shuffle (the call head on original line 1136 is missing from this view;
// presumably a recursive improveShuffleKindFromMask call — verify upstream).
1135 if (all_of(Mask, [NumSrcElts](int M) { return M < NumSrcElts; }))
1137 Index, SubTy);
1138 int NumSubElts;
1139 if (NumDstElts > 2 && ShuffleVectorInst::isInsertSubvectorMask(
1140 Mask, NumSrcElts, NumSubElts, Index)) {
1141 if (Index + NumSubElts > NumSrcElts)
1142 return Kind;
1143 SubTy = FixedVectorType::get(SrcTy->getElementType(), NumSubElts);
// NOTE(review): the return for the insert-subvector case (original line
// 1144) is missing from this view.
1145 }
1146 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
1147 return TTI::SK_Select;
1148 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
1149 return TTI::SK_Transpose;
1150 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
1151 return TTI::SK_Splice;
1152 break;
1153 }
// These kinds are already as specific as the mask can make them.
1154 case TTI::SK_Select:
1155 case TTI::SK_Reverse:
1156 case TTI::SK_Broadcast:
1157 case TTI::SK_Transpose:
// NOTE(review): two further case labels (original lines 1158-1159) are
// missing from this view.
1160 case TTI::SK_Splice:
1161 break;
1162 }
1163 return Kind;
1164 }
1165
// Shuffle cost: refine the kind from the mask, then dispatch to the matching
// fixed-vector scalarization-overhead helper.
// NOTE(review): the signature head of getShuffleCost (original lines
// 1166-1168) and several case labels / fallthroughs (original lines 1175,
// 1180-1181, 1184-1185, 1188) are missing from this extracted view.
1169 VectorType *SubTp, ArrayRef<const Value *> Args = {},
1170 const Instruction *CxtI = nullptr) const override {
1171 switch (improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp)) {
1172 case TTI::SK_Broadcast:
1173 if (auto *FVT = dyn_cast<FixedVectorType>(SrcTy))
1174 return getBroadcastShuffleOverhead(FVT, CostKind);
1176 case TTI::SK_Select:
1177 case TTI::SK_Splice:
1178 case TTI::SK_Reverse:
1179 case TTI::SK_Transpose:
1182 if (auto *FVT = dyn_cast<FixedVectorType>(SrcTy))
1183 return getPermuteShuffleOverhead(FVT, CostKind);
1186 return getExtractSubvectorOverhead(SrcTy, CostKind, Index,
1187 cast<FixedVectorType>(SubTp));
1189 return getInsertSubvectorOverhead(DstTy, CostKind, Index,
1190 cast<FixedVectorType>(SubTp));
1191 }
1192 llvm_unreachable("Unknown TTI::ShuffleKind");
1193 }
1194
// Cast cost: free/NOOP cases first, then legal-op, scalar, vector-to-vector
// (same-size, split, or scalarized) and finally vector<->scalar bitcasts.
// NOTE(review): the 'InstructionCost' return-type line (original 1195) and
// the CCH/CostKind parameter line (original 1197) are missing from this
// extracted view.
1196 getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1198 const Instruction *I = nullptr) const override {
1199 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1200 return 0;
1201
1202 const TargetLoweringBase *TLI = getTLI();
1203 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1204 assert(ISD && "Invalid opcode");
1205 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1206 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1207
1208 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1209 TypeSize DstSize = DstLT.second.getSizeInBits();
1210 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1211 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1212
1213 switch (Opcode) {
1214 default:
1215 break;
1216 case Instruction::Trunc:
1217 // Check for NOOP conversions.
1218 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1219 return 0;
1220 [[fallthrough]];
1221 case Instruction::BitCast:
1222 // Bitcast between types that are legalized to the same type are free and
1223 // assume int to/from ptr of the same size is also free.
1224 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1225 SrcSize == DstSize)
1226 return 0;
1227 break;
1228 case Instruction::FPExt:
1229 if (I && getTLI()->isExtFree(I))
1230 return 0;
1231 break;
1232 case Instruction::ZExt:
1233 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1234 return 0;
1235 [[fallthrough]];
1236 case Instruction::SExt:
1237 if (I && getTLI()->isExtFree(I))
1238 return 0;
1239
1240 // If this is a zext/sext of a load, return 0 if the corresponding
1241 // extending load exists on target and the result type is legal.
1242 if (CCH == TTI::CastContextHint::Normal) {
1243 EVT ExtVT = EVT::getEVT(Dst);
1244 EVT LoadVT = EVT::getEVT(Src);
1245 unsigned LType =
1246 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1247 if (DstLT.first == SrcLT.first &&
1248 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1249 return 0;
1250 }
1251 break;
1252 case Instruction::AddrSpaceCast:
1253 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1254 Dst->getPointerAddressSpace()))
1255 return 0;
1256 break;
1257 }
1258
1259 auto *SrcVTy = dyn_cast<VectorType>(Src);
1260 auto *DstVTy = dyn_cast<VectorType>(Dst);
1261
1262 // If the cast is marked as legal (or promote) then assume low cost.
1263 if (SrcLT.first == DstLT.first &&
1264 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1265 return SrcLT.first;
1266
1267 // Handle scalar conversions.
1268 if (!SrcVTy && !DstVTy) {
1269 // Just check the op cost. If the operation is legal then assume it costs
1270 // 1.
1271 if (!TLI->isOperationExpand(ISD, DstLT.second))
1272 return 1;
1273
1274 // Assume that illegal scalar instruction are expensive.
1275 return 4;
1276 }
1277
1278 // Check vector-to-vector casts.
1279 if (DstVTy && SrcVTy) {
1280 // If the cast is between same-sized registers, then the check is simple.
1281 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1282
1283 // Assume that Zext is done using AND.
1284 if (Opcode == Instruction::ZExt)
1285 return SrcLT.first;
1286
1287 // Assume that sext is done using SHL and SRA.
1288 if (Opcode == Instruction::SExt)
1289 return SrcLT.first * 2;
1290
1291 // Just check the op cost. If the operation is legal then assume it
1292 // costs
1293 // 1 and multiply by the type-legalization overhead.
1294 if (!TLI->isOperationExpand(ISD, DstLT.second))
1295 return SrcLT.first * 1;
1296 }
1297
1298 // If we are legalizing by splitting, query the concrete TTI for the cost
1299 // of casting the original vector twice. We also need to factor in the
1300 // cost of the split itself. Count that as 1, to be consistent with
1301 // getTypeLegalizationCost().
// NOTE(review): the right-hand sides of the two comparisons below (original
// lines 1304 and 1307, the split-vector legalize-action enumerator) are
// missing from this view.
1302 bool SplitSrc =
1303 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1305 bool SplitDst =
1306 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1308 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1309 DstVTy->getElementCount().isVector()) {
1310 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1311 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1312 const T *TTI = thisT();
1313 // If both types need to be split then the split is free.
1314 InstructionCost SplitCost =
1315 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1316 return SplitCost +
1317 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1318 CostKind, I));
1319 }
1320
1321 // Scalarization cost is Invalid, can't assume any num elements.
1322 if (isa<ScalableVectorType>(DstVTy))
// NOTE(review): the body of this if — presumably
// 'return InstructionCost::getInvalid();' (original line 1323) — is missing
// from this view.
1324
1325 // In other cases where the source or destination are illegal, assume
1326 // the operation will get scalarized.
1327 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1328 InstructionCost Cost = thisT()->getCastInstrCost(
1329 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1330
1331 // Return the cost of multiple scalar invocation plus the cost of
1332 // inserting and extracting the values.
1333 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1334 CostKind) +
1335 Num * Cost;
1336 }
1337
1338 // We already handled vector-to-vector and scalar-to-scalar conversions.
1339 // This
1340 // is where we handle bitcast between vectors and scalars. We need to assume
1341 // that the conversion is scalarized in one way or another.
1342 if (Opcode == Instruction::BitCast) {
1343 // Illegal bitcasts are done by storing and loading from a stack slot.
1344 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1345 /*Extract*/ true, CostKind)
1346 : 0) +
1347 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1348 /*Extract*/ false, CostKind)
1349 : 0);
1350 }
1351
1352 llvm_unreachable("Unhandled cast");
1353 }
1354
// Cost of extracting a lane and extending it: extract-element + cast.
// NOTE(review): the 'InstructionCost' return-type line (original 1355) and
// the final argument line of the getCastInstrCost call (original 1362) are
// missing from this extracted view.
1356 getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
1357 unsigned Index,
1358 TTI::TargetCostKind CostKind) const override {
1359 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1360 CostKind, Index, nullptr, nullptr) +
1361 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1363 }
1364
// Control-flow instruction cost: defer entirely to the base implementation.
// NOTE(review): the signature head (original lines 1365-1366) is missing
// from this extracted view.
1367 const Instruction *I = nullptr) const override {
1368 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1369 }
1370
// Compare/select cost: legal ops cost LT.first, otherwise fixed vectors are
// scalarized (per-lane cmp/sel plus insert overhead).
// NOTE(review): the return-type line and the CostKind/Op1Info/Op2Info
// parameter lines of this signature (original lines 1371, 1373-1375) are
// missing from this extracted view.
1372 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1376 const Instruction *I = nullptr) const override {
1377 const TargetLoweringBase *TLI = getTLI();
1378 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1379 assert(ISD && "Invalid opcode");
1380
// Types that don't map to any MVT are handled by the base implementation.
1381 if (getTLI()->getValueType(DL, ValTy, true) == MVT::Other)
1382 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1383 Op1Info, Op2Info, I);
1384
1385 // Selects on vectors are actually vector selects.
1386 if (ISD == ISD::SELECT) {
1387 assert(CondTy && "CondTy must exist");
1388 if (CondTy->isVectorTy())
1389 ISD = ISD::VSELECT;
1390 }
1391 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1392
1393 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1394 !TLI->isOperationExpand(ISD, LT.second)) {
1395 // The operation is legal. Assume it costs 1. Multiply
1396 // by the type-legalization overhead.
1397 return LT.first * 1;
1398 }
1399
1400 // Otherwise, assume that the cast is scalarized.
1401 // TODO: If one of the types get legalized by splitting, handle this
1402 // similarly to what getCastInstrCost() does.
1403 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1404 if (isa<ScalableVectorType>(ValTy))
// NOTE(review): the body of this if — presumably
// 'return InstructionCost::getInvalid();' (original line 1405) — is missing
// from this view.
1406
1407 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1408 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1409 Opcode, ValVTy->getScalarType(), CondTy->getScalarType(), VecPred,
1410 CostKind, Op1Info, Op2Info, I);
1411
1412 // Return the cost of multiple scalar invocation plus the cost of
1413 // inserting and extracting the values.
1414 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1415 /*Extract*/ false, CostKind) +
1416 Num * Cost;
1417 }
1418
1419 // Unknown scalar opcode.
1420 return 1;
1421 }
1422
// NOTE(review): the signature head of this getVectorInstrCost overload
// (original lines 1423-1424) is missing from this extracted view; the
// visible tail takes a lane index plus the two operand values.
1425 unsigned Index, const Value *Op0,
1426 const Value *Op1) const override {
// Base estimate ignores Index/operands and prices the scalar element type.
1427 return getRegUsageForType(Val->getScalarType());
1428 }
1429
1430 /// \param ScalarUserAndIdx encodes the information about extracts from a
1431 /// vector with 'Scalar' being the value being extracted,'User' being the user
1432 /// of the extract(nullptr if user is not known before vectorization) and
1433 /// 'Idx' being the extract lane.
// NOTE(review): the signature head of this overload (original lines
// 1434-1435) is missing from this extracted view.
1436 unsigned Index, Value *Scalar,
1437 ArrayRef<std::tuple<Value *, User *, int>>
1438 ScalarUserAndIdx) const override {
// The base implementation ignores the scalar-user context and defers to the
// plain per-lane overload.
1439 return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
1440 nullptr);
1441 }
1442
// Instruction-based overload: recover the insertelement operands (if any)
// and defer to the opcode-based overload.
// NOTE(review): the signature head (original lines 1443-1444) is missing
// from this extracted view.
1445 unsigned Index) const override {
1446 Value *Op0 = nullptr;
1447 Value *Op1 = nullptr;
1448 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1449 Op0 = IE->getOperand(0);
1450 Op1 = IE->getOperand(1);
1451 }
1452 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1453 Op1);
1454 }
1455
// Vector-instr cost with the lane counted from the end of the vector:
// translate to a front-based index for fixed vectors, then defer.
// NOTE(review): the signature head (original lines 1456-1458) is missing
// from this extracted view — verify the exact name against upstream.
1459 unsigned Index) const override {
// -1 (all-ones) is kept when the element count is not a compile-time
// constant (non-fixed vectors).
1460 unsigned NewIndex = -1;
1461 if (auto *FVTy = dyn_cast<FixedVectorType>(Val)) {
1462 assert(Index < FVTy->getNumElements() &&
1463 "Unexpected index from end of vector");
1464 NewIndex = FVTy->getNumElements() - 1 - Index;
1465 }
1466 return thisT()->getVectorInstrCost(Opcode, Val, CostKind, NewIndex, nullptr,
1467 nullptr);
1468 }
1469
// Replication shuffle: each of VF source lanes is repeated
// ReplicationFactor times; priced as extracting the demanded source lanes
// and inserting the demanded destination lanes.
// NOTE(review): the 'InstructionCost' return-type line (original 1470) and
// the declaration/initialization of 'Cost' (original line 1477) are missing
// from this extracted view.
1471 getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
1472 const APInt &DemandedDstElts,
1473 TTI::TargetCostKind CostKind) const override {
1474 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1475 "Unexpected size of DemandedDstElts.");
1476
1478
1479 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1480 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1481
1482 // The Mask shuffling cost is extract all the elements of the Mask
1483 // and insert each of them Factor times into the wide vector:
1484 //
1485 // E.g. an interleaved group with factor 3:
1486 // %mask = icmp ult <8 x i32> %vec1, %vec2
1487 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1488 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1489 // The cost is estimated as extract all mask elements from the <8xi1> mask
1490 // vector and insert them factor times into the <24xi1> shuffled mask
1491 // vector.
// Map demanded destination lanes back to the source lanes they replicate.
1492 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1493 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1494 /*Insert*/ false,
1495 /*Extract*/ true, CostKind);
1496 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1497 /*Insert*/ true,
1498 /*Extract*/ false, CostKind);
1499
1500 return Cost;
1501 }
1502
// Load/store cost: legalization count for simple types, plus scalarization
// when the op would lower to an illegal extending load / truncating store.
// NOTE(review): the 'InstructionCost getMemoryOpCost(' head (original line
// 1503) and two parameter lines (original 1505-1506) are missing from this
// extracted view.
1504 unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
1507 const Instruction *I = nullptr) const override {
1508 assert(!Src->isVoidTy() && "Invalid type");
1509 // Assume types, such as structs, are expensive.
1510 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1511 return 4;
1512 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1513
1514 // Assuming that all loads of legal types cost 1.
1515 InstructionCost Cost = LT.first;
// NOTE(review): the condition guarding this early return (original line
// 1516, a cost-kind check) is missing from this view.
1517 return Cost;
1518
1519 const DataLayout &DL = this->getDataLayout();
1520 if (Src->isVectorTy() &&
1521 // In practice it's not currently possible to have a change in lane
1522 // length for extending loads or truncating stores so both types should
1523 // have the same scalable property.
// NOTE(review): the left operand of this size comparison (original line
// 1524) is missing from this view.
1525 LT.second.getSizeInBits())) {
1526 // This is a vector load that legalizes to a larger type than the vector
1527 // itself. Unless the corresponding extending load or truncating store is
1528 // legal, then this will scalarize.
// NOTE(review): the declaration of 'LA' (original line 1529, a
// LegalizeAction) is missing from this view.
1530 EVT MemVT = getTLI()->getValueType(DL, Src);
1531 if (Opcode == Instruction::Store)
1532 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1533 else
1534 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1535
1536 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1537 // This is a vector load/store for some illegal type that is scalarized.
1538 // We must account for the cost of building or decomposing the vector.
// NOTE(review): the call head accumulating scalarization overhead into
// 'Cost' (original line 1539) is missing from this view.
1540 cast<VectorType>(Src), Opcode != Instruction::Store,
1541 Opcode == Instruction::Store, CostKind);
1542 }
1543 }
1544
1545 return Cost;
1546 }
1547
// Masked load/store: priced via the common masked-memory helper (variable
// mask assumed true, not a gather/scatter — cf. the /*IsGatherScatter*/
// annotation on sibling callers).
// NOTE(review): the 'InstructionCost' return-type line (original 1548) is
// missing from this extracted view.
1549 getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment,
1550 unsigned AddressSpace,
1551 TTI::TargetCostKind CostKind) const override {
1552 // TODO: Pass on AddressSpace when we have test coverage.
1553 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1554 CostKind);
1555 }
1556
// Gather/scatter: priced via the common masked-memory helper with the
// gather/scatter flag set; Ptr and I are unused by this base version.
// NOTE(review): the 'InstructionCost' return-type line (original 1557) and
// the cost-kind parameter line (original 1560) are missing from this view.
1558 getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
1559 bool VariableMask, Align Alignment,
1561 const Instruction *I = nullptr) const override {
1562 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1563 true, CostKind);
1564 }
1565
// NOTE(review): the 'InstructionCost' return-type line (original 1566) and
// the cost-kind parameter line (original 1569) are missing from this view.
1567 getExpandCompressMemoryOpCost(unsigned Opcode, Type *DataTy,
1568 bool VariableMask, Align Alignment,
1570 const Instruction *I = nullptr) const override {
1571 // Treat expand load/compress store as gather/scatter operation.
1572 // TODO: implement more precise cost estimation for these intrinsics.
1573 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1574 /*IsGatherScatter*/ true, CostKind);
1575 }
1576
// Strided memory op: the base implementation lowers it to a gather/scatter
// of the same width.
// NOTE(review): the signature head (original line 1577; the visible body
// suggests a strided-memory-op cost hook taking Opcode/DataTy — verify the
// exact name upstream) and the cost-kind parameter line (original 1580) are
// missing from this extracted view.
1578 const Value *Ptr, bool VariableMask,
1579 Align Alignment,
1581 const Instruction *I) const override {
1582 // For a target without strided memory operations (or for an illegal
1583 // operation type on one which does), assume we lower to a gather/scatter
1584 // operation. (Which may in turn be scalarized.)
1585 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1586 Alignment, CostKind, I);
1587 }
1588
// Interleaved load/store cost: the (possibly masked) wide memory op, scaled
// by the fraction of legalized instructions actually used, plus the
// de/interleaving shuffle overhead and optional mask manipulation.
// NOTE(review): the 'InstructionCost getInterleavedMemoryOpCost(' head
// (original line 1589) is missing from this extracted view.
1590 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1591 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1592 bool UseMaskForCond = false, bool UseMaskForGaps = false) const override {
1593
1594 // We cannot scalarize scalable vectors, so return Invalid.
1595 if (isa<ScalableVectorType>(VecTy))
// NOTE(review): the body of this if — presumably
// 'return InstructionCost::getInvalid();' (original line 1596) — is missing
// from this view.
1597
1598 auto *VT = cast<FixedVectorType>(VecTy);
1599
1600 unsigned NumElts = VT->getNumElements();
1601 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1602
1603 unsigned NumSubElts = NumElts / Factor;
1604 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1605
1606 // Firstly, the cost of load/store operation.
// NOTE(review): the declaration of 'Cost' (original line 1607) and the
// trailing argument line of the masked call (original line 1610) are
// missing from this view.
1608 if (UseMaskForCond || UseMaskForGaps)
1609 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1611 else
1612 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1613 CostKind);
1614
1615 // Legalize the vector type, and get the legalized and unlegalized type
1616 // sizes.
1617 MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1618 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1619 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1620
1621 // Scale the cost of the memory operation by the fraction of legalized
1622 // instructions that will actually be used. We shouldn't account for the
1623 // cost of dead instructions since they will be removed.
1624 //
1625 // E.g., An interleaved load of factor 8:
1626 // %vec = load <16 x i64>, <16 x i64>* %ptr
1627 // %v0 = shufflevector %vec, undef, <0, 8>
1628 //
1629 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1630 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1631 // type). The other loads are unused.
1632 //
1633 // TODO: Note that legalization can turn masked loads/stores into unmasked
1634 // (legalized) loads/stores. This can be reflected in the cost.
1635 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1636 // The number of loads of a legal type it will take to represent a load
1637 // of the unlegalized vector type.
1638 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1639
1640 // The number of elements of the unlegalized type that correspond to a
1641 // single legal instruction.
1642 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1643
1644 // Determine which legal instructions will be used.
1645 BitVector UsedInsts(NumLegalInsts, false);
1646 for (unsigned Index : Indices)
1647 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1648 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1649
1650 // Scale the cost of the load by the fraction of legal instructions that
1651 // will be used.
1652 Cost = divideCeil(UsedInsts.count() * Cost.getValue(), NumLegalInsts);
1653 }
1654
1655 // Then plus the cost of interleave operation.
1656 assert(Indices.size() <= Factor &&
1657 "Interleaved memory op has too many members");
1658
1659 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1660 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1661
// Bit i of DemandedLoadStoreElts marks a wide-vector lane touched by one of
// the requested member indices.
1662 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1663 for (unsigned Index : Indices) {
1664 assert(Index < Factor && "Invalid index for interleaved memory op");
1665 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1666 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1667 }
1668
1669 if (Opcode == Instruction::Load) {
1670 // The interleave cost is similar to extract sub vectors' elements
1671 // from the wide vector, and insert them into sub vectors.
1672 //
1673 // E.g. An interleaved load of factor 2 (with one member of index 0):
1674 // %vec = load <8 x i32>, <8 x i32>* %ptr
1675 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1676 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1677 // <8 x i32> vector and insert them into a <4 x i32> vector.
1678 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1679 SubVT, DemandedAllSubElts,
1680 /*Insert*/ true, /*Extract*/ false, CostKind);
1681 Cost += Indices.size() * InsSubCost;
1682 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1683 /*Insert*/ false,
1684 /*Extract*/ true, CostKind);
1685 } else {
1686 // The interleave cost is extract elements from sub vectors, and
1687 // insert them into the wide vector.
1688 //
1689 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1690 // (using VF=4):
1691 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1692 // %gaps.mask = <true, true, false, true, true, false,
1693 // true, true, false, true, true, false>
1694 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1695 // i32 Align, <12 x i1> %gaps.mask
1696 // The cost is estimated as extract all elements (of actual members,
1697 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1698 // i32> vector.
1699 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1700 SubVT, DemandedAllSubElts,
1701 /*Insert*/ false, /*Extract*/ true, CostKind);
1702 Cost += ExtSubCost * Indices.size();
1703 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1704 /*Insert*/ true,
1705 /*Extract*/ false, CostKind);
1706 }
1707
1708 if (!UseMaskForCond)
1709 return Cost;
1710
1711 Type *I8Type = Type::getInt8Ty(VT->getContext());
1712
// The condition mask must be replicated across the interleave group; model
// that as a replication shuffle of i8 lanes.
1713 Cost += thisT()->getReplicationShuffleCost(
1714 I8Type, Factor, NumSubElts,
1715 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1716 CostKind);
1717
1718 // The Gaps mask is invariant and created outside the loop, therefore the
1719 // cost of creating it is not accounted for here. However if we have both
1720 // a MaskForGaps and some other mask that guards the execution of the
1721 // memory access, we need to account for the cost of And-ing the two masks
1722 // inside the loop.
1723 if (UseMaskForGaps) {
1724 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1725 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1726 CostKind);
1727 }
1728
1729 return Cost;
1730 }
1731
1732 /// Get intrinsic cost based on arguments.
1735 TTI::TargetCostKind CostKind) const override {
1736 // Check for generically free intrinsics.
1738 return 0;
1739
1740 // Assume that target intrinsics are cheap.
1741 Intrinsic::ID IID = ICA.getID();
1744
1745 // VP Intrinsics should have the same cost as their non-vp counterpart.
1746 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1747 // counterpart when the vector length argument is smaller than the maximum
1748 // vector length.
1749 // TODO: Support other kinds of VPIntrinsics
1750 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1751 std::optional<unsigned> FOp =
1753 if (FOp) {
1754 if (ICA.getID() == Intrinsic::vp_load) {
1755 Align Alignment;
1756 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1757 Alignment = VPI->getPointerAlignment().valueOrOne();
1758 unsigned AS = 0;
1759 if (ICA.getArgTypes().size() > 1)
1760 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[0]))
1761 AS = PtrTy->getAddressSpace();
1762 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1763 AS, CostKind);
1764 }
1765 if (ICA.getID() == Intrinsic::vp_store) {
1766 Align Alignment;
1767 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1768 Alignment = VPI->getPointerAlignment().valueOrOne();
1769 unsigned AS = 0;
1770 if (ICA.getArgTypes().size() >= 2)
1771 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[1]))
1772 AS = PtrTy->getAddressSpace();
1773 return thisT()->getMemoryOpCost(*FOp, ICA.getArgTypes()[0], Alignment,
1774 AS, CostKind);
1775 }
1777 ICA.getID() == Intrinsic::vp_fneg) {
1778 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1779 CostKind);
1780 }
1781 if (VPCastIntrinsic::isVPCast(ICA.getID())) {
1782 return thisT()->getCastInstrCost(
1783 *FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
1785 }
1786 if (VPCmpIntrinsic::isVPCmp(ICA.getID())) {
1787 // We can only handle vp_cmp intrinsics with underlying instructions.
1788 if (ICA.getInst()) {
1789 assert(FOp);
1790 auto *UI = cast<VPCmpIntrinsic>(ICA.getInst());
1791 return thisT()->getCmpSelInstrCost(*FOp, ICA.getArgTypes()[0],
1792 ICA.getReturnType(),
1793 UI->getPredicate(), CostKind);
1794 }
1795 }
1796 }
1797
1798 if (ICA.getID() == Intrinsic::vp_scatter) {
1799 if (ICA.isTypeBasedOnly()) {
1800 IntrinsicCostAttributes MaskedScatter(
1803 ICA.getFlags());
1804 return getTypeBasedIntrinsicInstrCost(MaskedScatter, CostKind);
1805 }
1806 Align Alignment;
1807 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1808 Alignment = VPI->getPointerAlignment().valueOrOne();
1809 bool VarMask = isa<Constant>(ICA.getArgs()[2]);
1810 return thisT()->getGatherScatterOpCost(
1811 Instruction::Store, ICA.getArgTypes()[0], ICA.getArgs()[1], VarMask,
1812 Alignment, CostKind, nullptr);
1813 }
1814 if (ICA.getID() == Intrinsic::vp_gather) {
1815 if (ICA.isTypeBasedOnly()) {
1816 IntrinsicCostAttributes MaskedGather(
1819 ICA.getFlags());
1820 return getTypeBasedIntrinsicInstrCost(MaskedGather, CostKind);
1821 }
1822 Align Alignment;
1823 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1824 Alignment = VPI->getPointerAlignment().valueOrOne();
1825 bool VarMask = isa<Constant>(ICA.getArgs()[1]);
1826 return thisT()->getGatherScatterOpCost(
1827 Instruction::Load, ICA.getReturnType(), ICA.getArgs()[0], VarMask,
1828 Alignment, CostKind, nullptr);
1829 }
1830
1831 if (ICA.getID() == Intrinsic::vp_select ||
1832 ICA.getID() == Intrinsic::vp_merge) {
1833 TTI::OperandValueInfo OpInfoX, OpInfoY;
1834 if (!ICA.isTypeBasedOnly()) {
1835 OpInfoX = TTI::getOperandInfo(ICA.getArgs()[0]);
1836 OpInfoY = TTI::getOperandInfo(ICA.getArgs()[1]);
1837 }
1838 return getCmpSelInstrCost(
1839 Instruction::Select, ICA.getReturnType(), ICA.getArgTypes()[0],
1840 CmpInst::BAD_ICMP_PREDICATE, CostKind, OpInfoX, OpInfoY);
1841 }
1842
1843 std::optional<Intrinsic::ID> FID =
1845
1846 // Not functionally equivalent but close enough for cost modelling.
1847 if (ICA.getID() == Intrinsic::experimental_vp_reverse)
1848 FID = Intrinsic::vector_reverse;
1849
1850 if (FID) {
1851 // Non-vp version will have same arg types except mask and vector
1852 // length.
1853 assert(ICA.getArgTypes().size() >= 2 &&
1854 "Expected VPIntrinsic to have Mask and Vector Length args and "
1855 "types");
1856
1857 ArrayRef<const Value *> NewArgs = ArrayRef(ICA.getArgs());
1858 if (!ICA.isTypeBasedOnly())
1859 NewArgs = NewArgs.drop_back(2);
1861
1862 // VPReduction intrinsics have a start value argument that their non-vp
1863 // counterparts do not have, except for the fadd and fmul non-vp
1864 // counterpart.
1866 *FID != Intrinsic::vector_reduce_fadd &&
1867 *FID != Intrinsic::vector_reduce_fmul) {
1868 if (!ICA.isTypeBasedOnly())
1869 NewArgs = NewArgs.drop_front();
1870 NewTys = NewTys.drop_front();
1871 }
1872
1873 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewArgs,
1874 NewTys, ICA.getFlags());
1875 return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1876 }
1877 }
1878
1879 if (ICA.isTypeBasedOnly())
1881
1882 Type *RetTy = ICA.getReturnType();
1883
1886
1887 const IntrinsicInst *I = ICA.getInst();
1888 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1889 FastMathFlags FMF = ICA.getFlags();
1890 switch (IID) {
1891 default:
1892 break;
1893
1894 case Intrinsic::powi:
1895 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1896 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1897 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1898 ShouldOptForSize)) {
1899 // The cost is modeled on the expansion performed by ExpandPowI in
1900 // SelectionDAGBuilder.
1901 APInt Exponent = RHSC->getValue().abs();
1902 unsigned ActiveBits = Exponent.getActiveBits();
1903 unsigned PopCount = Exponent.popcount();
1904 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1905 thisT()->getArithmeticInstrCost(
1906 Instruction::FMul, RetTy, CostKind);
1907 if (RHSC->isNegative())
1908 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1909 CostKind);
1910 return Cost;
1911 }
1912 }
1913 break;
1914 case Intrinsic::cttz:
1915 // FIXME: If necessary, this should go in target-specific overrides.
1916 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1918 break;
1919
1920 case Intrinsic::ctlz:
1921 // FIXME: If necessary, this should go in target-specific overrides.
1922 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1924 break;
1925
1926 case Intrinsic::memcpy:
1927 return thisT()->getMemcpyCost(ICA.getInst());
1928
1929 case Intrinsic::masked_scatter: {
1930 const Value *Mask = Args[3];
1931 bool VarMask = !isa<Constant>(Mask);
1932 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1933 return thisT()->getGatherScatterOpCost(Instruction::Store,
1934 ICA.getArgTypes()[0], Args[1],
1935 VarMask, Alignment, CostKind, I);
1936 }
1937 case Intrinsic::masked_gather: {
1938 const Value *Mask = Args[2];
1939 bool VarMask = !isa<Constant>(Mask);
1940 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1941 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1942 VarMask, Alignment, CostKind, I);
1943 }
1944 case Intrinsic::masked_compressstore: {
1945 const Value *Data = Args[0];
1946 const Value *Mask = Args[2];
1947 Align Alignment = I->getParamAlign(1).valueOrOne();
1948 return thisT()->getExpandCompressMemoryOpCost(
1949 Instruction::Store, Data->getType(), !isa<Constant>(Mask), Alignment,
1950 CostKind, I);
1951 }
1952 case Intrinsic::masked_expandload: {
1953 const Value *Mask = Args[1];
1954 Align Alignment = I->getParamAlign(0).valueOrOne();
1955 return thisT()->getExpandCompressMemoryOpCost(Instruction::Load, RetTy,
1956 !isa<Constant>(Mask),
1957 Alignment, CostKind, I);
1958 }
1959 case Intrinsic::experimental_vp_strided_store: {
1960 const Value *Data = Args[0];
1961 const Value *Ptr = Args[1];
1962 const Value *Mask = Args[3];
1963 const Value *EVL = Args[4];
1964 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1965 Type *EltTy = cast<VectorType>(Data->getType())->getElementType();
1966 Align Alignment =
1967 I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy));
1968 return thisT()->getStridedMemoryOpCost(Instruction::Store,
1969 Data->getType(), Ptr, VarMask,
1970 Alignment, CostKind, I);
1971 }
1972 case Intrinsic::experimental_vp_strided_load: {
1973 const Value *Ptr = Args[0];
1974 const Value *Mask = Args[2];
1975 const Value *EVL = Args[3];
1976 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1977 Type *EltTy = cast<VectorType>(RetTy)->getElementType();
1978 Align Alignment =
1979 I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy));
1980 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1981 VarMask, Alignment, CostKind, I);
1982 }
1983 case Intrinsic::stepvector: {
1984 if (isa<ScalableVectorType>(RetTy))
1986 // The cost of materialising a constant integer vector.
1988 }
1989 case Intrinsic::vector_extract: {
1990 // FIXME: Handle case where a scalable vector is extracted from a scalable
1991 // vector
1992 if (isa<ScalableVectorType>(RetTy))
1994 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1995 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1996 cast<VectorType>(RetTy),
1997 cast<VectorType>(Args[0]->getType()), {},
1998 CostKind, Index, cast<VectorType>(RetTy));
1999 }
2000 case Intrinsic::vector_insert: {
2001 // FIXME: Handle case where a scalable vector is inserted into a scalable
2002 // vector
2003 if (isa<ScalableVectorType>(Args[1]->getType()))
2005 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
2006 return thisT()->getShuffleCost(
2007 TTI::SK_InsertSubvector, cast<VectorType>(RetTy),
2008 cast<VectorType>(Args[0]->getType()), {}, CostKind, Index,
2009 cast<VectorType>(Args[1]->getType()));
2010 }
2011 case Intrinsic::vector_splice: {
2012 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
2013 return thisT()->getShuffleCost(TTI::SK_Splice, cast<VectorType>(RetTy),
2014 cast<VectorType>(Args[0]->getType()), {},
2015 CostKind, Index, cast<VectorType>(RetTy));
2016 }
2017 case Intrinsic::vector_reduce_add:
2018 case Intrinsic::vector_reduce_mul:
2019 case Intrinsic::vector_reduce_and:
2020 case Intrinsic::vector_reduce_or:
2021 case Intrinsic::vector_reduce_xor:
2022 case Intrinsic::vector_reduce_smax:
2023 case Intrinsic::vector_reduce_smin:
2024 case Intrinsic::vector_reduce_fmax:
2025 case Intrinsic::vector_reduce_fmin:
2026 case Intrinsic::vector_reduce_fmaximum:
2027 case Intrinsic::vector_reduce_fminimum:
2028 case Intrinsic::vector_reduce_umax:
2029 case Intrinsic::vector_reduce_umin: {
2030 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
2032 }
2033 case Intrinsic::vector_reduce_fadd:
2034 case Intrinsic::vector_reduce_fmul: {
2036 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
2038 }
2039 case Intrinsic::fshl:
2040 case Intrinsic::fshr: {
2041 const Value *X = Args[0];
2042 const Value *Y = Args[1];
2043 const Value *Z = Args[2];
2046 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
2047
2048 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
2049 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
2051 Cost +=
2052 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2053 Cost +=
2054 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
2055 Cost += thisT()->getArithmeticInstrCost(
2056 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
2057 {OpInfoZ.Kind, TTI::OP_None});
2058 Cost += thisT()->getArithmeticInstrCost(
2059 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
2060 {OpInfoZ.Kind, TTI::OP_None});
2061 // Non-constant shift amounts requires a modulo. If the typesize is a
2062 // power-2 then this will be converted to an and, otherwise it will use a
2063 // urem.
2064 if (!OpInfoZ.isConstant())
2065 Cost += thisT()->getArithmeticInstrCost(
2066 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? BinaryOperator::And
2067 : BinaryOperator::URem,
2068 RetTy, CostKind, OpInfoZ,
2069 {TTI::OK_UniformConstantValue, TTI::OP_None});
2070 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
2071 if (X != Y) {
2072 Type *CondTy = RetTy->getWithNewBitWidth(1);
2073 Cost +=
2074 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2076 Cost +=
2077 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2079 }
2080 return Cost;
2081 }
2082 case Intrinsic::experimental_cttz_elts: {
2083 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
2084
2085 // If we're not expanding the intrinsic then we assume this is cheap
2086 // to implement.
2087 if (!getTLI()->shouldExpandCttzElements(ArgType))
2088 return getTypeLegalizationCost(RetTy).first;
2089
2090 // TODO: The costs below reflect the expansion code in
2091 // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
2092 // favour of compile time.
2093
2094 // Find the smallest "sensible" element type to use for the expansion.
2095 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
2096 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
2097 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
2098 VScaleRange = getVScaleRange(I->getCaller(), 64);
2099
2100 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
2101 RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
2102 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
2103
2104 // Create the new vector type & get the vector length
2105 Type *NewVecTy = VectorType::get(
2106 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
2107
2108 IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, NewVecTy, {},
2109 FMF);
2111 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
2112
2113 Cost +=
2114 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
2115 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
2116 Args[0]->getType(),
2118 Cost +=
2119 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
2120
2121 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
2122 NewEltTy, NewVecTy, FMF, I, 1);
2123 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
2124 Cost +=
2125 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
2126
2127 return Cost;
2128 }
2129 case Intrinsic::get_active_lane_mask:
2130 case Intrinsic::experimental_vector_match:
2131 case Intrinsic::experimental_vector_histogram_add:
2132 case Intrinsic::experimental_vector_histogram_uadd_sat:
2133 case Intrinsic::experimental_vector_histogram_umax:
2134 case Intrinsic::experimental_vector_histogram_umin:
2135 return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
2136 case Intrinsic::modf:
2137 case Intrinsic::sincos:
2138 case Intrinsic::sincospi: {
2139 Type *Ty = getContainedTypes(RetTy).front();
2140 EVT VT = getTLI()->getValueType(DL, Ty);
2141
2142 RTLIB::Libcall LC = [&] {
2143 switch (ICA.getID()) {
2144 case Intrinsic::modf:
2145 return RTLIB::getMODF;
2146 case Intrinsic::sincos:
2147 return RTLIB::getSINCOS;
2148 case Intrinsic::sincospi:
2149 return RTLIB::getSINCOSPI;
2150 default:
2151 llvm_unreachable("unexpected intrinsic");
2152 }
2153 }()(VT.getScalarType());
2154
2155 std::optional<unsigned> CallRetElementIndex;
2156 // The first element of the modf result is returned by value in the
2157 // libcall.
2158 if (ICA.getID() == Intrinsic::modf)
2159 CallRetElementIndex = 0;
2160
2161 if (auto Cost = getMultipleResultIntrinsicVectorLibCallCost(
2162 ICA, CostKind, LC, CallRetElementIndex))
2163 return *Cost;
2164 // Otherwise, fallback to default scalarization cost.
2165 break;
2166 }
2167 }
2168
2169 // Assume that we need to scalarize this intrinsic.)
2170 // Compute the scalarization overhead based on Args for a vector
2171 // intrinsic.
2172 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2173 if (RetVF.isVector() && !RetVF.isScalable()) {
2174 ScalarizationCost = 0;
2175 if (!RetTy->isVoidTy()) {
2176 for (Type *VectorTy : getContainedTypes(RetTy)) {
2177 ScalarizationCost += getScalarizationOverhead(
2178 cast<VectorType>(VectorTy),
2179 /*Insert=*/true, /*Extract=*/false, CostKind);
2180 }
2181 }
2182 ScalarizationCost += getOperandsScalarizationOverhead(
2183 filterConstantAndDuplicatedOperands(Args, ICA.getArgTypes()),
2184 CostKind);
2185 }
2186
2187 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
2188 ScalarizationCost);
2189 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
2190 }
2191
2192 /// Get intrinsic cost based on argument types.
2193 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
2194 /// cost of scalarizing the arguments and the return value will be computed
2195 /// based on types.
2199 Intrinsic::ID IID = ICA.getID();
2200 Type *RetTy = ICA.getReturnType();
2201 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
2202 FastMathFlags FMF = ICA.getFlags();
2203 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
2204 bool SkipScalarizationCost = ICA.skipScalarizationCost();
2205
2206 VectorType *VecOpTy = nullptr;
2207 if (!Tys.empty()) {
2208 // The vector reduction operand is operand 0 except for fadd/fmul.
2209 // Their operand 0 is a scalar start value, so the vector op is operand 1.
2210 unsigned VecTyIndex = 0;
2211 if (IID == Intrinsic::vector_reduce_fadd ||
2212 IID == Intrinsic::vector_reduce_fmul)
2213 VecTyIndex = 1;
2214 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
2215 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
2216 }
2217
2218 // Library call cost - other than size, make it expensive.
2219 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
2220 unsigned ISD = 0;
2221 switch (IID) {
2222 default: {
2223 // Scalable vectors cannot be scalarized, so return Invalid.
2224 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2225 return isa<ScalableVectorType>(Ty);
2226 }))
2228
2229 // Assume that we need to scalarize this intrinsic.
2230 InstructionCost ScalarizationCost =
2231 SkipScalarizationCost ? ScalarizationCostPassed : 0;
2232 unsigned ScalarCalls = 1;
2233 Type *ScalarRetTy = RetTy;
2234 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2235 if (!SkipScalarizationCost)
2236 ScalarizationCost = getScalarizationOverhead(
2237 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
2238 ScalarCalls = std::max(ScalarCalls,
2239 cast<FixedVectorType>(RetVTy)->getNumElements());
2240 ScalarRetTy = RetTy->getScalarType();
2241 }
2242 SmallVector<Type *, 4> ScalarTys;
2243 for (Type *Ty : Tys) {
2244 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2245 if (!SkipScalarizationCost)
2246 ScalarizationCost += getScalarizationOverhead(
2247 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2248 ScalarCalls = std::max(ScalarCalls,
2249 cast<FixedVectorType>(VTy)->getNumElements());
2250 Ty = Ty->getScalarType();
2251 }
2252 ScalarTys.push_back(Ty);
2253 }
2254 if (ScalarCalls == 1)
2255 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
2256
2257 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
2258 InstructionCost ScalarCost =
2259 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
2260
2261 return ScalarCalls * ScalarCost + ScalarizationCost;
2262 }
2263 // Look for intrinsics that can be lowered directly or turned into a scalar
2264 // intrinsic call.
2265 case Intrinsic::sqrt:
2266 ISD = ISD::FSQRT;
2267 break;
2268 case Intrinsic::sin:
2269 ISD = ISD::FSIN;
2270 break;
2271 case Intrinsic::cos:
2272 ISD = ISD::FCOS;
2273 break;
2274 case Intrinsic::sincos:
2275 ISD = ISD::FSINCOS;
2276 break;
2277 case Intrinsic::sincospi:
2278 ISD = ISD::FSINCOSPI;
2279 break;
2280 case Intrinsic::modf:
2281 ISD = ISD::FMODF;
2282 break;
2283 case Intrinsic::tan:
2284 ISD = ISD::FTAN;
2285 break;
2286 case Intrinsic::asin:
2287 ISD = ISD::FASIN;
2288 break;
2289 case Intrinsic::acos:
2290 ISD = ISD::FACOS;
2291 break;
2292 case Intrinsic::atan:
2293 ISD = ISD::FATAN;
2294 break;
2295 case Intrinsic::atan2:
2296 ISD = ISD::FATAN2;
2297 break;
2298 case Intrinsic::sinh:
2299 ISD = ISD::FSINH;
2300 break;
2301 case Intrinsic::cosh:
2302 ISD = ISD::FCOSH;
2303 break;
2304 case Intrinsic::tanh:
2305 ISD = ISD::FTANH;
2306 break;
2307 case Intrinsic::exp:
2308 ISD = ISD::FEXP;
2309 break;
2310 case Intrinsic::exp2:
2311 ISD = ISD::FEXP2;
2312 break;
2313 case Intrinsic::exp10:
2314 ISD = ISD::FEXP10;
2315 break;
2316 case Intrinsic::log:
2317 ISD = ISD::FLOG;
2318 break;
2319 case Intrinsic::log10:
2320 ISD = ISD::FLOG10;
2321 break;
2322 case Intrinsic::log2:
2323 ISD = ISD::FLOG2;
2324 break;
2325 case Intrinsic::ldexp:
2326 ISD = ISD::FLDEXP;
2327 break;
2328 case Intrinsic::fabs:
2329 ISD = ISD::FABS;
2330 break;
2331 case Intrinsic::canonicalize:
2332 ISD = ISD::FCANONICALIZE;
2333 break;
2334 case Intrinsic::minnum:
2335 ISD = ISD::FMINNUM;
2336 break;
2337 case Intrinsic::maxnum:
2338 ISD = ISD::FMAXNUM;
2339 break;
2340 case Intrinsic::minimum:
2341 ISD = ISD::FMINIMUM;
2342 break;
2343 case Intrinsic::maximum:
2344 ISD = ISD::FMAXIMUM;
2345 break;
2346 case Intrinsic::minimumnum:
2347 ISD = ISD::FMINIMUMNUM;
2348 break;
2349 case Intrinsic::maximumnum:
2350 ISD = ISD::FMAXIMUMNUM;
2351 break;
2352 case Intrinsic::copysign:
2353 ISD = ISD::FCOPYSIGN;
2354 break;
2355 case Intrinsic::floor:
2356 ISD = ISD::FFLOOR;
2357 break;
2358 case Intrinsic::ceil:
2359 ISD = ISD::FCEIL;
2360 break;
2361 case Intrinsic::trunc:
2362 ISD = ISD::FTRUNC;
2363 break;
2364 case Intrinsic::nearbyint:
2365 ISD = ISD::FNEARBYINT;
2366 break;
2367 case Intrinsic::rint:
2368 ISD = ISD::FRINT;
2369 break;
2370 case Intrinsic::lrint:
2371 ISD = ISD::LRINT;
2372 break;
2373 case Intrinsic::llrint:
2374 ISD = ISD::LLRINT;
2375 break;
2376 case Intrinsic::round:
2377 ISD = ISD::FROUND;
2378 break;
2379 case Intrinsic::roundeven:
2380 ISD = ISD::FROUNDEVEN;
2381 break;
2382 case Intrinsic::lround:
2383 ISD = ISD::LROUND;
2384 break;
2385 case Intrinsic::llround:
2386 ISD = ISD::LLROUND;
2387 break;
2388 case Intrinsic::pow:
2389 ISD = ISD::FPOW;
2390 break;
2391 case Intrinsic::fma:
2392 ISD = ISD::FMA;
2393 break;
2394 case Intrinsic::fmuladd:
2395 ISD = ISD::FMA;
2396 break;
2397 case Intrinsic::experimental_constrained_fmuladd:
2398 ISD = ISD::STRICT_FMA;
2399 break;
2400 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2401 case Intrinsic::lifetime_start:
2402 case Intrinsic::lifetime_end:
2403 case Intrinsic::sideeffect:
2404 case Intrinsic::pseudoprobe:
2405 case Intrinsic::arithmetic_fence:
2406 return 0;
2407 case Intrinsic::masked_store: {
2408 Type *Ty = Tys[0];
2409 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2410 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2411 CostKind);
2412 }
2413 case Intrinsic::masked_load: {
2414 Type *Ty = RetTy;
2415 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2416 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2417 CostKind);
2418 }
2419 case Intrinsic::experimental_vp_strided_store: {
2420 auto *Ty = cast<VectorType>(ICA.getArgTypes()[0]);
2421 Align Alignment = thisT()->DL.getABITypeAlign(Ty->getElementType());
2422 return thisT()->getStridedMemoryOpCost(
2423 Instruction::Store, Ty, /*Ptr=*/nullptr, /*VariableMask=*/true,
2424 Alignment, CostKind, ICA.getInst());
2425 }
2426 case Intrinsic::experimental_vp_strided_load: {
2427 auto *Ty = cast<VectorType>(ICA.getReturnType());
2428 Align Alignment = thisT()->DL.getABITypeAlign(Ty->getElementType());
2429 return thisT()->getStridedMemoryOpCost(
2430 Instruction::Load, Ty, /*Ptr=*/nullptr, /*VariableMask=*/true,
2431 Alignment, CostKind, ICA.getInst());
2432 }
2433 case Intrinsic::vector_reduce_add:
2434 case Intrinsic::vector_reduce_mul:
2435 case Intrinsic::vector_reduce_and:
2436 case Intrinsic::vector_reduce_or:
2437 case Intrinsic::vector_reduce_xor:
2438 return thisT()->getArithmeticReductionCost(
2439 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2440 CostKind);
2441 case Intrinsic::vector_reduce_fadd:
2442 case Intrinsic::vector_reduce_fmul:
2443 return thisT()->getArithmeticReductionCost(
2444 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2445 case Intrinsic::vector_reduce_smax:
2446 case Intrinsic::vector_reduce_smin:
2447 case Intrinsic::vector_reduce_umax:
2448 case Intrinsic::vector_reduce_umin:
2449 case Intrinsic::vector_reduce_fmax:
2450 case Intrinsic::vector_reduce_fmin:
2451 case Intrinsic::vector_reduce_fmaximum:
2452 case Intrinsic::vector_reduce_fminimum:
2453 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2454 VecOpTy, ICA.getFlags(), CostKind);
2455 case Intrinsic::experimental_vector_match: {
2456 auto *SearchTy = cast<VectorType>(ICA.getArgTypes()[0]);
2457 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
2458 unsigned SearchSize = NeedleTy->getNumElements();
2459
2460 // If we're not expanding the intrinsic then we assume this is cheap to
2461 // implement.
2462 EVT SearchVT = getTLI()->getValueType(DL, SearchTy);
2463 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
2464 return getTypeLegalizationCost(RetTy).first;
2465
2466 // Approximate the cost based on the expansion code in
2467 // SelectionDAGBuilder.
2469 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, NeedleTy,
2470 CostKind, 1, nullptr, nullptr);
2471 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
2472 CostKind, 0, nullptr, nullptr);
2473 Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, SearchTy, {},
2474 CostKind, 0, nullptr);
2475 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
2477 Cost +=
2478 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2479 Cost *= SearchSize;
2480 Cost +=
2481 thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
2482 return Cost;
2483 }
2484 case Intrinsic::vector_reverse:
2485 return thisT()->getShuffleCost(TTI::SK_Reverse, cast<VectorType>(RetTy),
2486 cast<VectorType>(ICA.getArgTypes()[0]), {},
2487 CostKind, 0, cast<VectorType>(RetTy));
2488 case Intrinsic::experimental_vector_histogram_add:
2489 case Intrinsic::experimental_vector_histogram_uadd_sat:
2490 case Intrinsic::experimental_vector_histogram_umax:
2491 case Intrinsic::experimental_vector_histogram_umin: {
2492 FixedVectorType *PtrsTy = dyn_cast<FixedVectorType>(ICA.getArgTypes()[0]);
2493 Type *EltTy = ICA.getArgTypes()[1];
2494
2495 // Targets with scalable vectors must handle this on their own.
2496 if (!PtrsTy)
2498
2499 Align Alignment = thisT()->DL.getABITypeAlign(EltTy);
2501 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, PtrsTy,
2502 CostKind, 1, nullptr, nullptr);
2503 Cost += thisT()->getMemoryOpCost(Instruction::Load, EltTy, Alignment, 0,
2504 CostKind);
2505 switch (IID) {
2506 default:
2507 llvm_unreachable("Unhandled histogram update operation.");
2508 case Intrinsic::experimental_vector_histogram_add:
2509 Cost +=
2510 thisT()->getArithmeticInstrCost(Instruction::Add, EltTy, CostKind);
2511 break;
2512 case Intrinsic::experimental_vector_histogram_uadd_sat: {
2513 IntrinsicCostAttributes UAddSat(Intrinsic::uadd_sat, EltTy, {EltTy});
2514 Cost += thisT()->getIntrinsicInstrCost(UAddSat, CostKind);
2515 break;
2516 }
2517 case Intrinsic::experimental_vector_histogram_umax: {
2518 IntrinsicCostAttributes UMax(Intrinsic::umax, EltTy, {EltTy});
2519 Cost += thisT()->getIntrinsicInstrCost(UMax, CostKind);
2520 break;
2521 }
2522 case Intrinsic::experimental_vector_histogram_umin: {
2523 IntrinsicCostAttributes UMin(Intrinsic::umin, EltTy, {EltTy});
2524 Cost += thisT()->getIntrinsicInstrCost(UMin, CostKind);
2525 break;
2526 }
2527 }
2528 Cost += thisT()->getMemoryOpCost(Instruction::Store, EltTy, Alignment, 0,
2529 CostKind);
2530 Cost *= PtrsTy->getNumElements();
2531 return Cost;
2532 }
2533 case Intrinsic::get_active_lane_mask: {
2534 Type *ArgTy = ICA.getArgTypes()[0];
2535 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
2536 EVT ArgVT = getTLI()->getValueType(DL, ArgTy, true);
2537
2538 // If we're not expanding the intrinsic then we assume this is cheap
2539 // to implement.
2540 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgVT))
2541 return getTypeLegalizationCost(RetTy).first;
2542
2543 // Create the expanded types that will be used to calculate the uadd_sat
2544 // operation.
2545 Type *ExpRetTy =
2546 VectorType::get(ArgTy, cast<VectorType>(RetTy)->getElementCount());
2547 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
2549 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
2550 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
2552 return Cost;
2553 }
2554 case Intrinsic::experimental_memset_pattern:
2555 // This cost is set to match the cost of the memset_pattern16 libcall.
2556 // It should likely be re-evaluated after migration to this intrinsic
2557 // is complete.
2558 return TTI::TCC_Basic * 4;
2559 case Intrinsic::abs:
2560 ISD = ISD::ABS;
2561 break;
2562 case Intrinsic::fshl:
2563 ISD = ISD::FSHL;
2564 break;
2565 case Intrinsic::fshr:
2566 ISD = ISD::FSHR;
2567 break;
2568 case Intrinsic::smax:
2569 ISD = ISD::SMAX;
2570 break;
2571 case Intrinsic::smin:
2572 ISD = ISD::SMIN;
2573 break;
2574 case Intrinsic::umax:
2575 ISD = ISD::UMAX;
2576 break;
2577 case Intrinsic::umin:
2578 ISD = ISD::UMIN;
2579 break;
2580 case Intrinsic::sadd_sat:
2581 ISD = ISD::SADDSAT;
2582 break;
2583 case Intrinsic::ssub_sat:
2584 ISD = ISD::SSUBSAT;
2585 break;
2586 case Intrinsic::uadd_sat:
2587 ISD = ISD::UADDSAT;
2588 break;
2589 case Intrinsic::usub_sat:
2590 ISD = ISD::USUBSAT;
2591 break;
2592 case Intrinsic::smul_fix:
2593 ISD = ISD::SMULFIX;
2594 break;
2595 case Intrinsic::umul_fix:
2596 ISD = ISD::UMULFIX;
2597 break;
2598 case Intrinsic::sadd_with_overflow:
2599 ISD = ISD::SADDO;
2600 break;
2601 case Intrinsic::ssub_with_overflow:
2602 ISD = ISD::SSUBO;
2603 break;
2604 case Intrinsic::uadd_with_overflow:
2605 ISD = ISD::UADDO;
2606 break;
2607 case Intrinsic::usub_with_overflow:
2608 ISD = ISD::USUBO;
2609 break;
2610 case Intrinsic::smul_with_overflow:
2611 ISD = ISD::SMULO;
2612 break;
2613 case Intrinsic::umul_with_overflow:
2614 ISD = ISD::UMULO;
2615 break;
2616 case Intrinsic::fptosi_sat:
2617 case Intrinsic::fptoui_sat: {
2618 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Tys[0]);
2619 std::pair<InstructionCost, MVT> RetLT = getTypeLegalizationCost(RetTy);
2620
2621 // For cast instructions, types are different between source and
2622 // destination. Also need to check if the source type can be legalize.
2623 if (!SrcLT.first.isValid() || !RetLT.first.isValid())
2625 ISD = IID == Intrinsic::fptosi_sat ? ISD::FP_TO_SINT_SAT
2627 break;
2628 }
2629 case Intrinsic::ctpop:
2630 ISD = ISD::CTPOP;
2631 // In case of legalization use TCC_Expensive. This is cheaper than a
2632 // library call but still not a cheap instruction.
2633 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2634 break;
2635 case Intrinsic::ctlz:
2636 ISD = ISD::CTLZ;
2637 break;
2638 case Intrinsic::cttz:
2639 ISD = ISD::CTTZ;
2640 break;
2641 case Intrinsic::bswap:
2642 ISD = ISD::BSWAP;
2643 break;
2644 case Intrinsic::bitreverse:
2645 ISD = ISD::BITREVERSE;
2646 break;
2647 case Intrinsic::ucmp:
2648 ISD = ISD::UCMP;
2649 break;
2650 case Intrinsic::scmp:
2651 ISD = ISD::SCMP;
2652 break;
2653 }
2654
2655 auto *ST = dyn_cast<StructType>(RetTy);
2656 Type *LegalizeTy = ST ? ST->getContainedType(0) : RetTy;
2657 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(LegalizeTy);
2658
2659 const TargetLoweringBase *TLI = getTLI();
2660
2661 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2662 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2663 TLI->isFAbsFree(LT.second)) {
2664 return 0;
2665 }
2666
2667 // The operation is legal. Assume it costs 1.
2668 // If the type is split to multiple registers, assume that there is some
2669 // overhead to this.
2670 // TODO: Once we have extract/insert subvector cost we need to use them.
2671 if (LT.first > 1)
2672 return (LT.first * 2);
2673 else
2674 return (LT.first * 1);
2675 } else if (TLI->isOperationCustom(ISD, LT.second)) {
2676 // If the operation is custom lowered then assume
2677 // that the code is twice as expensive.
2678 return (LT.first * 2);
2679 }
2680
2681 switch (IID) {
2682 case Intrinsic::fmuladd: {
2683 // If we can't lower fmuladd into an FMA estimate the cost as a floating
2684 // point mul followed by an add.
2685
2686 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2687 CostKind) +
2688 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2689 CostKind);
2690 }
2691 case Intrinsic::experimental_constrained_fmuladd: {
2692 IntrinsicCostAttributes FMulAttrs(
2693 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2694 IntrinsicCostAttributes FAddAttrs(
2695 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2696 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2697 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2698 }
2699 case Intrinsic::smin:
2700 case Intrinsic::smax:
2701 case Intrinsic::umin:
2702 case Intrinsic::umax: {
2703 // minmax(X,Y) = select(icmp(X,Y),X,Y)
2704 Type *CondTy = RetTy->getWithNewBitWidth(1);
2705 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2706 CmpInst::Predicate Pred =
2707 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2709 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2710 Pred, CostKind);
2711 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2712 Pred, CostKind);
2713 return Cost;
2714 }
2715 case Intrinsic::sadd_with_overflow:
2716 case Intrinsic::ssub_with_overflow: {
2717 Type *SumTy = RetTy->getContainedType(0);
2718 Type *OverflowTy = RetTy->getContainedType(1);
2719 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2720 ? BinaryOperator::Add
2721 : BinaryOperator::Sub;
2722
2723 // Add:
2724 // Overflow -> (Result < LHS) ^ (RHS < 0)
2725 // Sub:
2726 // Overflow -> (Result < LHS) ^ (RHS > 0)
2728 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2729 Cost +=
2730 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy,
2732 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2733 CostKind);
2734 return Cost;
2735 }
2736 case Intrinsic::uadd_with_overflow:
2737 case Intrinsic::usub_with_overflow: {
2738 Type *SumTy = RetTy->getContainedType(0);
2739 Type *OverflowTy = RetTy->getContainedType(1);
2740 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2741 ? BinaryOperator::Add
2742 : BinaryOperator::Sub;
2743 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2746
2748 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2749 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
2750 OverflowTy, Pred, CostKind);
2751 return Cost;
2752 }
2753 case Intrinsic::smul_with_overflow:
2754 case Intrinsic::umul_with_overflow: {
2755 Type *MulTy = RetTy->getContainedType(0);
2756 Type *OverflowTy = RetTy->getContainedType(1);
2757 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2758 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2759 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2760
2761 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2763
2765 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2766 Cost +=
2767 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2768 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2769 CCH, CostKind);
2770 Cost += thisT()->getArithmeticInstrCost(
2771 Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2773
2774 if (IsSigned)
2775 Cost += thisT()->getArithmeticInstrCost(
2776 Instruction::AShr, MulTy, CostKind,
2779
2780 Cost += thisT()->getCmpSelInstrCost(
2781 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2782 return Cost;
2783 }
2784 case Intrinsic::sadd_sat:
2785 case Intrinsic::ssub_sat: {
2786 // Assume a default expansion.
2787 Type *CondTy = RetTy->getWithNewBitWidth(1);
2788
2789 Type *OpTy = StructType::create({RetTy, CondTy});
2790 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2791 ? Intrinsic::sadd_with_overflow
2792 : Intrinsic::ssub_with_overflow;
2794
2795 // SatMax -> Overflow && SumDiff < 0
2796 // SatMin -> Overflow && SumDiff >= 0
2798 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2799 nullptr, ScalarizationCostPassed);
2800 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2801 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2802 Pred, CostKind);
2803 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2804 CondTy, Pred, CostKind);
2805 return Cost;
2806 }
2807 case Intrinsic::uadd_sat:
2808 case Intrinsic::usub_sat: {
2809 Type *CondTy = RetTy->getWithNewBitWidth(1);
2810
2811 Type *OpTy = StructType::create({RetTy, CondTy});
2812 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2813 ? Intrinsic::uadd_with_overflow
2814 : Intrinsic::usub_with_overflow;
2815
2817 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2818 nullptr, ScalarizationCostPassed);
2819 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2820 Cost +=
2821 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2823 return Cost;
2824 }
2825 case Intrinsic::smul_fix:
2826 case Intrinsic::umul_fix: {
2827 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2828 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2829
2830 unsigned ExtOp =
2831 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2833
2835 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2836 Cost +=
2837 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2838 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2839 CCH, CostKind);
2840 Cost += thisT()->getArithmeticInstrCost(
2841 Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2843 Cost += thisT()->getArithmeticInstrCost(
2844 Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2846 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2847 return Cost;
2848 }
2849 case Intrinsic::abs: {
2850 // abs(X) = select(icmp(X,0),X,sub(0,X))
2851 Type *CondTy = RetTy->getWithNewBitWidth(1);
2854 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2855 Pred, CostKind);
2856 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2857 Pred, CostKind);
2858 // TODO: Should we add an OperandValueProperties::OP_Zero property?
2859 Cost += thisT()->getArithmeticInstrCost(
2860 BinaryOperator::Sub, RetTy, CostKind,
2862 return Cost;
2863 }
2864 case Intrinsic::fshl:
2865 case Intrinsic::fshr: {
2866 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
2867 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
2868 Type *CondTy = RetTy->getWithNewBitWidth(1);
2870 Cost +=
2871 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2872 Cost +=
2873 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
2874 Cost +=
2875 thisT()->getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
2876 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::LShr, RetTy,
2877 CostKind);
2878 // Non-constant shift amounts requires a modulo. If the typesize is a
2879 // power-2 then this will be converted to an and, otherwise it will use a
2880 // urem.
2881 Cost += thisT()->getArithmeticInstrCost(
2882 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? BinaryOperator::And
2883 : BinaryOperator::URem,
2884 RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2885 {TTI::OK_UniformConstantValue, TTI::OP_None});
2886 // Shift-by-zero handling.
2887 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2889 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2891 return Cost;
2892 }
2893 case Intrinsic::fptosi_sat:
2894 case Intrinsic::fptoui_sat: {
2895 if (Tys.empty())
2896 break;
2897 Type *FromTy = Tys[0];
2898 bool IsSigned = IID == Intrinsic::fptosi_sat;
2899
2901 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2902 {FromTy, FromTy});
2903 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2904 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2905 {FromTy, FromTy});
2906 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2907 Cost += thisT()->getCastInstrCost(
2908 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2910 if (IsSigned) {
2911 Type *CondTy = RetTy->getWithNewBitWidth(1);
2912 Cost += thisT()->getCmpSelInstrCost(
2913 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2914 Cost += thisT()->getCmpSelInstrCost(
2915 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2916 }
2917 return Cost;
2918 }
2919 case Intrinsic::ucmp:
2920 case Intrinsic::scmp: {
2921 Type *CmpTy = Tys[0];
2922 Type *CondTy = RetTy->getWithNewBitWidth(1);
2924 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2926 CostKind) +
2927 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2929 CostKind);
2930
2931 EVT VT = TLI->getValueType(DL, CmpTy, true);
2932 if (TLI->shouldExpandCmpUsingSelects(VT)) {
2933 // x < y ? -1 : (x > y ? 1 : 0)
2934 Cost += 2 * thisT()->getCmpSelInstrCost(
2935 BinaryOperator::Select, RetTy, CondTy,
2937 } else {
2938 // zext(x > y) - zext(x < y)
2939 Cost +=
2940 2 * thisT()->getCastInstrCost(CastInst::ZExt, RetTy, CondTy,
2942 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy,
2943 CostKind);
2944 }
2945 return Cost;
2946 }
2947 case Intrinsic::maximumnum:
2948 case Intrinsic::minimumnum: {
2949 // On platform that support FMAXNUM_IEEE/FMINNUM_IEEE, we expand
2950 // maximumnum/minimumnum to
2951 // ARG0 = fcanonicalize ARG0, ARG0 // to quiet ARG0
2952 // ARG1 = fcanonicalize ARG1, ARG1 // to quiet ARG1
2953 // RESULT = MAXNUM_IEEE ARG0, ARG1 // or MINNUM_IEEE
2954 // FIXME: In LangRef, we claimed FMAXNUM has the same behaviour of
2955 // FMAXNUM_IEEE, while the backend hasn't migrated the code yet.
2956 // Finally, we will remove FMAXNUM_IEEE and FMINNUM_IEEE.
2957 int IeeeISD =
2958 IID == Intrinsic::maximumnum ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
2959 if (TLI->isOperationLegal(IeeeISD, LT.second)) {
2960 IntrinsicCostAttributes FCanonicalizeAttrs(Intrinsic::canonicalize,
2961 RetTy, Tys[0]);
2962 InstructionCost FCanonicalizeCost =
2963 thisT()->getIntrinsicInstrCost(FCanonicalizeAttrs, CostKind);
2964 return LT.first + FCanonicalizeCost * 2;
2965 }
2966 break;
2967 }
2968 default:
2969 break;
2970 }
2971
2972 // Else, assume that we need to scalarize this intrinsic. For math builtins
2973 // this will emit a costly libcall, adding call overhead and spills. Make it
2974 // very expensive.
2975 if (isVectorizedTy(RetTy)) {
2977
2978 // Scalable vectors cannot be scalarized, so return Invalid.
2979 if (any_of(concat<Type *const>(RetVTys, Tys),
2980 [](Type *Ty) { return isa<ScalableVectorType>(Ty); }))
2982
2983 InstructionCost ScalarizationCost = ScalarizationCostPassed;
2984 if (!SkipScalarizationCost) {
2985 ScalarizationCost = 0;
2986 for (Type *RetVTy : RetVTys) {
2987 ScalarizationCost += getScalarizationOverhead(
2988 cast<VectorType>(RetVTy), /*Insert=*/true,
2989 /*Extract=*/false, CostKind);
2990 }
2991 }
2992
2993 unsigned ScalarCalls = getVectorizedTypeVF(RetTy).getFixedValue();
2994 SmallVector<Type *, 4> ScalarTys;
2995 for (Type *Ty : Tys) {
2996 if (Ty->isVectorTy())
2997 Ty = Ty->getScalarType();
2998 ScalarTys.push_back(Ty);
2999 }
3000 IntrinsicCostAttributes Attrs(IID, toScalarizedTy(RetTy), ScalarTys, FMF);
3001 InstructionCost ScalarCost =
3002 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
3003 for (Type *Ty : Tys) {
3004 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
3005 if (!ICA.skipScalarizationCost())
3006 ScalarizationCost += getScalarizationOverhead(
3007 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
3008 ScalarCalls = std::max(ScalarCalls,
3009 cast<FixedVectorType>(VTy)->getNumElements());
3010 }
3011 }
3012 return ScalarCalls * ScalarCost + ScalarizationCost;
3013 }
3014
3015 // This is going to be turned into a library call, make it expensive.
3016 return SingleCallCost;
3017 }
3018
3019 /// Compute a cost of the given call instruction.
3020 ///
3021 /// Compute the cost of calling function F with return type RetTy and
3022 /// argument types Tys. F might be nullptr, in this case the cost of an
3023 /// arbitrary call with the specified signature will be returned.
3024 /// This is used, for instance, when we estimate call of a vector
3025 /// counterpart of the given function.
3026 /// \param F Called function, might be nullptr.
3027 /// \param RetTy Return value types.
3028 /// \param Tys Argument types.
3029 /// \returns The cost of Call instruction.
3032 TTI::TargetCostKind CostKind) const override {
3033 return 10;
3034 }
3035
3036 unsigned getNumberOfParts(Type *Tp) const override {
3037 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3038 if (!LT.first.isValid())
3039 return 0;
3040 // Try to find actual number of parts for non-power-of-2 elements as
3041 // ceil(num-of-elements/num-of-subtype-elements).
3042 if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
3043 Tp && LT.second.isFixedLengthVector() &&
3044 !has_single_bit(FTp->getNumElements())) {
3045 if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
3046 EVT(LT.second).getTypeForEVT(Tp->getContext()));
3047 SubTp && SubTp->getElementType() == FTp->getElementType())
3048 return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
3049 }
3050 return LT.first.getValue();
3051 }
3052
3055 TTI::TargetCostKind) const override {
3056 return 0;
3057 }
3058
3059 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
3060 /// We're assuming that reduction operation are performing the following way:
3061 ///
3062 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
3063 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
3064 /// \----------------v-------------/ \----------v------------/
3065 /// n/2 elements n/2 elements
3066 /// %red1 = op <n x t> %val, <n x t> val1
3067 /// After this operation we have a vector %red1 where only the first n/2
3068 /// elements are meaningful, the second n/2 elements are undefined and can be
3069 /// dropped. All other operations are actually working with the vector of
3070 /// length n/2, not n, though the real vector length is still n.
3071 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
3072 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
3073 /// \----------------v-------------/ \----------v------------/
3074 /// n/4 elements 3*n/4 elements
3075 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
3076 /// length n/2, the resulting vector has length n/4 etc.
3077 ///
3078 /// The cost model should take into account that the actual length of the
3079 /// vector is reduced on each iteration.
3082 // Targets must implement a default value for the scalable case, since
3083 // we don't know how many lanes the vector has.
3084 if (isa<ScalableVectorType>(Ty))
3086
3087 Type *ScalarTy = Ty->getElementType();
3088 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
3089 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
3090 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
3091 NumVecElts >= 2) {
3092 // Or reduction for i1 is represented as:
3093 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
3094 // %res = cmp ne iReduxWidth %val, 0
3095 // And reduction for i1 is represented as:
3096 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
3097 // %res = cmp eq iReduxWidth %val, 11111
3098 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
3099 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
3101 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
3104 }
3105 unsigned NumReduxLevels = Log2_32(NumVecElts);
3106 InstructionCost ArithCost = 0;
3107 InstructionCost ShuffleCost = 0;
3108 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
3109 unsigned LongVectorCount = 0;
3110 unsigned MVTLen =
3111 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
3112 while (NumVecElts > MVTLen) {
3113 NumVecElts /= 2;
3114 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
3115 ShuffleCost += thisT()->getShuffleCost(
3116 TTI::SK_ExtractSubvector, SubTy, Ty, {}, CostKind, NumVecElts, SubTy);
3117 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
3118 Ty = SubTy;
3119 ++LongVectorCount;
3120 }
3121
3122 NumReduxLevels -= LongVectorCount;
3123
3124 // The minimal length of the vector is limited by the real length of vector
3125 // operations performed on the current platform. That's why several final
3126 // reduction operations are performed on the vectors with the same
3127 // architecture-dependent length.
3128
3129 // By default reductions need one shuffle per reduction level.
3130 ShuffleCost +=
3131 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
3132 Ty, {}, CostKind, 0, Ty);
3133 ArithCost +=
3134 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
3135 return ShuffleCost + ArithCost +
3136 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
3137 CostKind, 0, nullptr, nullptr);
3138 }
3139
3140 /// Try to calculate the cost of performing strict (in-order) reductions,
3141 /// which involves doing a sequence of floating point additions in lane
3142 /// order, starting with an initial value. For example, consider a scalar
3143 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
3144 ///
3145 /// Vector = <float %v0, float %v1, float %v2, float %v3>
3146 ///
3147 /// %add1 = %InitVal + %v0
3148 /// %add2 = %add1 + %v1
3149 /// %add3 = %add2 + %v2
3150 /// %add4 = %add3 + %v3
3151 ///
3152 /// As a simple estimate we can say the cost of such a reduction is 4 times
3153 /// the cost of a scalar FP addition. We can only estimate the costs for
3154 /// fixed-width vectors here because for scalable vectors we do not know the
3155 /// runtime number of operations.
3158 // Targets must implement a default value for the scalable case, since
3159 // we don't know how many lanes the vector has.
3160 if (isa<ScalableVectorType>(Ty))
3162
3163 auto *VTy = cast<FixedVectorType>(Ty);
3165 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
3166 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
3167 Opcode, VTy->getElementType(), CostKind);
3168 ArithCost *= VTy->getNumElements();
3169
3170 return ExtractCost + ArithCost;
3171 }
3172
3175 std::optional<FastMathFlags> FMF,
3176 TTI::TargetCostKind CostKind) const override {
3177 assert(Ty && "Unknown reduction vector type");
3179 return getOrderedReductionCost(Opcode, Ty, CostKind);
3180 return getTreeReductionCost(Opcode, Ty, CostKind);
3181 }
3182
3183 /// Try to calculate op costs for min/max reduction operations.
3184 /// \param CondTy Conditional type for the Select instruction.
3187 TTI::TargetCostKind CostKind) const override {
3188 // Targets must implement a default value for the scalable case, since
3189 // we don't know how many lanes the vector has.
3190 if (isa<ScalableVectorType>(Ty))
3192
3193 Type *ScalarTy = Ty->getElementType();
3194 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
3195 unsigned NumReduxLevels = Log2_32(NumVecElts);
3196 InstructionCost MinMaxCost = 0;
3197 InstructionCost ShuffleCost = 0;
3198 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
3199 unsigned LongVectorCount = 0;
3200 unsigned MVTLen =
3201 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
3202 while (NumVecElts > MVTLen) {
3203 NumVecElts /= 2;
3204 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
3205
3206 ShuffleCost += thisT()->getShuffleCost(
3207 TTI::SK_ExtractSubvector, SubTy, Ty, {}, CostKind, NumVecElts, SubTy);
3208
3209 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
3210 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
3211 Ty = SubTy;
3212 ++LongVectorCount;
3213 }
3214
3215 NumReduxLevels -= LongVectorCount;
3216
3217 // The minimal length of the vector is limited by the real length of vector
3218 // operations performed on the current platform. That's why several final
3219 // reduction opertions are perfomed on the vectors with the same
3220 // architecture-dependent length.
3221 ShuffleCost +=
3222 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
3223 Ty, {}, CostKind, 0, Ty);
3224 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
3225 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
3226 // The last min/max should be in vector registers and we counted it above.
3227 // So just need a single extractelement.
3228 return ShuffleCost + MinMaxCost +
3229 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
3230 CostKind, 0, nullptr, nullptr);
3231 }
3232
3234 getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy,
3235 VectorType *Ty, std::optional<FastMathFlags> FMF,
3236 TTI::TargetCostKind CostKind) const override {
3237 if (auto *FTy = dyn_cast<FixedVectorType>(Ty);
3238 FTy && IsUnsigned && Opcode == Instruction::Add &&
3239 FTy->getElementType() == IntegerType::getInt1Ty(Ty->getContext())) {
3240 // Represent vector_reduce_add(ZExt(<n x i1>)) as
3241 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
3242 auto *IntTy =
3243 IntegerType::get(ResTy->getContext(), FTy->getNumElements());
3244 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy},
3245 FMF ? *FMF : FastMathFlags());
3246 return thisT()->getCastInstrCost(Instruction::BitCast, IntTy, FTy,
3248 thisT()->getIntrinsicInstrCost(ICA, CostKind);
3249 }
3250 // Without any native support, this is equivalent to the cost of
3251 // vecreduce.opcode(ext(Ty A)).
3252 VectorType *ExtTy = VectorType::get(ResTy, Ty);
3253 InstructionCost RedCost =
3254 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
3255 InstructionCost ExtCost = thisT()->getCastInstrCost(
3256 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
3258
3259 return RedCost + ExtCost;
3260 }
3261
3263 getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty,
3264 TTI::TargetCostKind CostKind) const override {
3265 // Without any native support, this is equivalent to the cost of
3266 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
3267 // vecreduce.add(mul(A, B)).
3268 VectorType *ExtTy = VectorType::get(ResTy, Ty);
3269 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
3270 Instruction::Add, ExtTy, std::nullopt, CostKind);
3271 InstructionCost ExtCost = thisT()->getCastInstrCost(
3272 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
3274
3275 InstructionCost MulCost =
3276 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
3277
3278 return RedCost + MulCost + 2 * ExtCost;
3279 }
3280
3282
3283 /// @}
3284};
3285
3286/// Concrete BasicTTIImpl that can be used if no further customization
3287/// is needed.
3288class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
3290
3291 friend class BasicTTIImplBase<BasicTTIImpl>;
3292
3293 const TargetSubtargetInfo *ST;
3294 const TargetLoweringBase *TLI;
3295
3296 const TargetSubtargetInfo *getST() const { return ST; }
3297 const TargetLoweringBase *getTLI() const { return TLI; }
3298
3299public:
3300 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
3301};
3302
3303} // end namespace llvm
3304
3305#endif // LLVM_CODEGEN_BASICTTIIMPL_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
This file implements the BitVector class.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint32_t Index
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
static unsigned getNumElements(Type *Ty)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1201
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1130
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
an instruction to allocate memory on the stack
Definition: Instructions.h:64
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:200
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:206
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:82
InstructionCost getFPOpCost(Type *Ty) const override
Definition: BasicTTIImpl.h:662
bool preferToKeepConstantsAttached(const Instruction &Inst, const Function &Fn) const override
Definition: BasicTTIImpl.h:672
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty) const override
Definition: BasicTTIImpl.h:496
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
Definition: BasicTTIImpl.h:558
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
Definition: BasicTTIImpl.h:695
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
Definition: BasicTTIImpl.h:875
bool shouldBuildLookupTables() const override
Definition: BasicTTIImpl.h:622
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:888
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override
Definition: BasicTTIImpl.h:424
bool isProfitableToHoist(Instruction *I) const override
Definition: BasicTTIImpl.h:542
unsigned getNumberOfParts(Type *Tp) const override
virtual bool enableWritePrefetching() const override
Definition: BasicTTIImpl.h:861
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
bool useAA() const override
Definition: BasicTTIImpl.h:546
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
Definition: BasicTTIImpl.h:476
bool isLegalAddScalableImmediate(int64_t Imm) const override
Definition: BasicTTIImpl.h:451
unsigned getAssumedAddrSpace(const Value *V) const override
Definition: BasicTTIImpl.h:428
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const override
Definition: BasicTTIImpl.h:807
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
Definition: BasicTTIImpl.h:459
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override
Definition: BasicTTIImpl.h:410
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
Definition: BasicTTIImpl.h:384
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty) const override
Definition: BasicTTIImpl.h:501
bool haveFastSqrt(Type *Ty) const override
Definition: BasicTTIImpl.h:653
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
Definition: BasicTTIImpl.h:419
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx) const override
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const override
Definition: BasicTTIImpl.h:564
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
Definition: BasicTTIImpl.h:442
unsigned adjustInliningThreshold(const CallBase *CB) const override
Definition: BasicTTIImpl.h:692
unsigned getInliningThresholdMultiplier() const override
Definition: BasicTTIImpl.h:691
InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
Definition: BasicTTIImpl.h:831
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset)
Definition: BasicTTIImpl.h:472
bool shouldBuildRelLookupTables() const override
Definition: BasicTTIImpl.h:628
bool isTargetIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx) const override
Definition: BasicTTIImpl.h:936
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Definition: BasicTTIImpl.h:523
unsigned getEpilogueVectorizationMinVF() const override
Definition: BasicTTIImpl.h:788
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
InstructionCost getVectorSplitCost() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Definition: BasicTTIImpl.h:538
std::optional< unsigned > getMaxVScale() const override
Definition: BasicTTIImpl.h:879
unsigned getFlatAddressSpace() const override
Definition: BasicTTIImpl.h:414
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
Compute a cost of the given call instruction.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Definition: BasicTTIImpl.h:702
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const override
Definition: BasicTTIImpl.h:438
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition: BasicTTIImpl.h:774
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Get intrinsic cost based on argument types.
bool hasBranchDivergence(const Function *F=nullptr) const override
Definition: BasicTTIImpl.h:398
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const override
Definition: BasicTTIImpl.h:920
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
Definition: BasicTTIImpl.h:792
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const override
Definition: BasicTTIImpl.h:376
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
Definition: BasicTTIImpl.h:802
bool shouldDropLSRSolutionIfLessProfitable() const override
Definition: BasicTTIImpl.h:515
int getInlinerVectorBonusPercent() const override
Definition: BasicTTIImpl.h:700
virtual unsigned getPrefetchDistance() const override
Definition: BasicTTIImpl.h:845
bool isVScaleKnownToBeAPowerOfTwo() const override
Definition: BasicTTIImpl.h:883
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
virtual unsigned getCacheLineSize() const override
Definition: BasicTTIImpl.h:841
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:997
bool isLegalAddImmediate(int64_t imm) const override
Definition: BasicTTIImpl.h:447
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isSingleThreaded() const override
Definition: BasicTTIImpl.h:432
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
Definition: BasicTTIImpl.h:942
virtual bool shouldPrefetchAddressSpace(unsigned AS) const override
Definition: BasicTTIImpl.h:865
virtual ~BasicTTIImplBase()=default
bool isProfitableLSRChainElement(Instruction *I) const override
Definition: BasicTTIImpl.h:519
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override
Definition: BasicTTIImpl.h:406
bool isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const override
Definition: BasicTTIImpl.h:930
bool isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx) const override
Definition: BasicTTIImpl.h:925
std::optional< unsigned > getVScaleForTuning() const override
Definition: BasicTTIImpl.h:880
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
virtual std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
Definition: BasicTTIImpl.h:825
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const override
Definition: BasicTTIImpl.h:797
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
Definition: BasicTTIImpl.h:814
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
bool isSourceOfDivergence(const Value *V) const override
Definition: BasicTTIImpl.h:402
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
Definition: BasicTTIImpl.h:660
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
Definition: BasicTTIImpl.h:978
bool isAlwaysUniform(const Value *V) const override
Definition: BasicTTIImpl.h:404
bool isLegalICmpImmediate(int64_t imm) const override
Definition: BasicTTIImpl.h:455
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
Definition: BasicTTIImpl.h:849
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const override
Definition: BasicTTIImpl.h:782
unsigned getRegUsageForType(Type *Ty) const override
Definition: BasicTTIImpl.h:553
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:367
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
Definition: BasicTTIImpl.h:548
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const override
Definition: BasicTTIImpl.h:506
virtual unsigned getMaxPrefetchIterationsAhead() const override
Definition: BasicTTIImpl.h:857
InstructionCost getOperandsScalarizationOverhead(ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
Estimate the overhead of scalarizing an instruction's operands.
Definition: BasicTTIImpl.h:957
bool isNumRegsMajorCostOfLSR() const override
Definition: BasicTTIImpl.h:511
Concrete BasicTTIImpl that can be used if no further customization is needed.
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1116
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:705
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ ICMP_EQ
equal
Definition: InstrTypes.h:699
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:688
CmpInst::Predicate getLTPredicate() const
CmpInst::Predicate getGTPredicate() const
This class represents a range of values.
Definition: ConstantRange.h:47
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:481
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:842
unsigned getIndexSizeInBits(unsigned AS) const
The size in bits of indices used for address calculation in getelementptr and for addresses in the gi...
Definition: DataLayout.h:398
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:327
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:323
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:352
The core instruction combiner logic.
Definition: InstCombiner.h:48
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:312
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
const TargetLibraryInfo * getLibInfo() const
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Return the minimum stride necessary to trigger software prefetching.
virtual bool enableWritePrefetching() const
virtual unsigned getMaxPrefetchIterationsAhead() const
Return the maximum prefetch distance in terms of loop iterations.
virtual unsigned getPrefetchDistance() const
Return the preferred prefetch distance in terms of instructions.
virtual std::optional< unsigned > getCacheAssociativity(unsigned Level) const
Return the cache associativity for the given level of cache.
virtual std::optional< unsigned > getCacheLineSize(unsigned Level) const
Return the target cache line size in bytes at a given level.
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isSpliceMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static LLVM_ABI bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isTransposeMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask is a transpose mask.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
size_type size() const
Definition: SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:34
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:44
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:43
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:620
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed store is legal on this target.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
const TargetMachine & getTargetMachine() const
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
virtual bool isCheapToSpeculateCttz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic cttz.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
unsigned getBitWidthForCttzElements(Type *RetTy, ElementCount EC, bool ZeroIsPoison, const ConstantRange *VScaleRange) const
Return the minimum number of bits required to hold the maximum possible number of trailing zero vecto...
virtual bool shouldExpandCmpUsingSelects(EVT VT) const
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isProfitableToHoist(Instruction *I) const
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
virtual bool isCheapToSpeculateCtlz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic ctlz.
virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const
Return the prefered common base offset.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
virtual bool isLegalAddScalableImmediate(int64_t) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
virtual std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
If the specified predicate checks whether a generic pointer falls within a specified address space,...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
virtual unsigned getAssumedAddrSpace(const Value *V) const
If the specified generic pointer could be assumed as a pointer to a specific address space,...
TargetOptions Options
ThreadModel::Model ThreadModel
ThreadModel - This flag specifies the type of threading model to assume for things like atomics.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine,...
virtual bool isProfitableLSRChainElement(Instruction *I) const
virtual const DataLayout & getDataLayout() const
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
virtual std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
virtual bool shouldDropLSRSolutionIfLessProfitable() const
virtual bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
virtual std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
virtual std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
virtual unsigned getEpilogueVectorizationMinVF() const
virtual bool isLoweredToCall(const Function *F) const
virtual InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
virtual InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
virtual InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
virtual InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
virtual InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, const Instruction *I) const
virtual TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
MemIndexedMode
The type of load/store indexing.
@ MIM_PostInc
Post-incrementing.
@ MIM_PostDec
Post-decrementing.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
CacheLevel
The possible cache levels.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:47
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:408
LLVM_ABI bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition: Triple.cpp:1771
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, DriverKit, XROS, or bridgeOS).
Definition: Triple.h:608
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
Value * getOperand(unsigned i) const
Definition: User.h:232
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
static LLVM_ABI bool isVPCast(Intrinsic::ID ID)
static LLVM_ABI bool isVPCmp(Intrinsic::ID ID)
static LLVM_ABI std::optional< unsigned > getFunctionalOpcodeForVP(Intrinsic::ID ID)
static LLVM_ABI std::optional< Intrinsic::ID > getFunctionalIntrinsicIDForVP(Intrinsic::ID ID)
static LLVM_ABI bool isVPIntrinsic(Intrinsic::ID)
static LLVM_ABI bool isVPReduction(Intrinsic::ID ID)
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
Provides info so a possible vectorization of a function can be computed.
bool isMasked() const
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:534
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:695
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:463
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:219
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by.
Definition: APInt.cpp:3009
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ SMULFIX
RESULT = [US]MULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on 2 integers with the same...
Definition: ISDOpcodes.h:387
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
Definition: ISDOpcodes.h:1098
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:1020
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
Definition: ISDOpcodes.h:1094
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:1018
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1090
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:347
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1162
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1166
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:343
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1075
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:928
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:351
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:787
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition: ISDOpcodes.h:726
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:927
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:360
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1086
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1634
LLVM_ABI bool isTargetIntrinsic(ID IID)
isTargetIntrinsic - Returns true if IID is an intrinsic specific to a certain target.
Definition: Intrinsics.cpp:629
LLVM_ABI Libcall getSINCOSPI(EVT RetVT)
getSINCOSPI - Return the SINCOSPI_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getMODF(EVT RetVT)
getMODF - Return the MODF_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getSINCOS(EVT RetVT)
getSINCOS - Return the SINCOS_* value for the given types, or UNKNOWN_LIBCALL if there is none.
template class LLVM_TEMPLATE_ABI opt< unsigned >
Definition: CommandLine.cpp:82
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1023
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:870
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
Type * toScalarizedTy(Type *Ty)
A helper for converting vectorized types to scalarized (non-vector) types.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
Definition: LoopUtils.cpp:975
bool isVectorizedTy(Type *Ty)
Returns true if Ty is a vector type or a struct of vector types where all vector types share the same...
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
ElementCount getVectorizedTypeVF(Type *Ty)
Returns the number of vector elements for a vectorized type.
LLVM_ABI ConstantRange getVScaleRange(const Function *F, unsigned BitWidth)
Determine the possible constant range of vscale with the given bit width, based on the vscale_range f...
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
InstructionCost Cost
cl::opt< unsigned > PartialUnrollingThreshold
LLVM_ABI bool isVectorizedStructTy(StructType *StructTy)
Returns true if StructTy is an unpacked literal struct where all elements are vectors of matching ele...
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:299
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
Attributes of a target dependent hardware loop.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling based on profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before t...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).