LLVM 21.0.0git
RISCVTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
19#include <cmath>
20#include <optional>
21using namespace llvm;
22using namespace llvm::PatternMatch;
23
24#define DEBUG_TYPE "riscvtti"
25
// Command-line override for the LMUL assumed by getRegisterBitWidth queries;
// this scales the vector register width reported to the autovectorizers.
// NOTE(review): extraction dropped the cl::opt declaration lines themselves
// (template type, variable name, default value) — only the name/description
// string arguments survive here. Verify against upstream before editing.
27 "riscv-v-register-bit-width-lmul",
29 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
30 "by autovectorized code. Fractional LMULs are not supported."),
// Command-line override for the maximum VF answered to getMaximumVF, which
// per the description string is consumed only by the SLP vectorizer.
34 "riscv-v-slp-max-vf",
36 "Overrides result used for getMaximumVF query which is used "
37 "exclusively by SLP vectorizer."),
// Return the cost of executing the given sequence of RISC-V vector opcodes on
// vectors of type VT. Most opcodes cost one LMUL's worth of work
// (TLI->getLMULCost); gathers, slides, reductions and the scalar-move / mask
// ops are special-cased in the switch below.
// NOTE(review): extraction dropped several lines in this function (the
// CostKind parameter line and the conditions guarding the early
// `return NumInstr;` / `return LMULCost * NumInstr;` paths) — confirm the
// exact control flow against upstream before relying on it.
41RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
43 // Check if the type is valid for all CostKind
44 if (!VT.isVector())
46 size_t NumInstr = OpCodes.size();
48 return NumInstr;
49 InstructionCost LMULCost = TLI->getLMULCost(VT);
51 return LMULCost * NumInstr;
53 for (auto Op : OpCodes) {
54 switch (Op) {
// Gathers and slides use dedicated per-subtarget cost hooks rather than the
// generic LMUL cost.
55 case RISCV::VRGATHER_VI:
56 Cost += TLI->getVRGatherVICost(VT);
57 break;
58 case RISCV::VRGATHER_VV:
59 Cost += TLI->getVRGatherVVCost(VT);
60 break;
61 case RISCV::VSLIDEUP_VI:
62 case RISCV::VSLIDEDOWN_VI:
63 Cost += TLI->getVSlideVICost(VT);
64 break;
65 case RISCV::VSLIDEUP_VX:
66 case RISCV::VSLIDEDOWN_VX:
67 Cost += TLI->getVSlideVXCost(VT);
68 break;
// Unordered reductions: cost modeled as log2(VL) steps. For scalable types
// the element count is scaled by the tuning vscale estimate.
69 case RISCV::VREDMAX_VS:
70 case RISCV::VREDMIN_VS:
71 case RISCV::VREDMAXU_VS:
72 case RISCV::VREDMINU_VS:
73 case RISCV::VREDSUM_VS:
74 case RISCV::VREDAND_VS:
75 case RISCV::VREDOR_VS:
76 case RISCV::VREDXOR_VS:
77 case RISCV::VFREDMAX_VS:
78 case RISCV::VFREDMIN_VS:
79 case RISCV::VFREDUSUM_VS: {
80 unsigned VL = VT.getVectorMinNumElements();
81 if (!VT.isFixedLengthVector())
82 VL *= *getVScaleForTuning();
83 Cost += Log2_32_Ceil(VL);
84 break;
85 }
// Ordered FP sum reduction: cost modeled as linear in VL.
86 case RISCV::VFREDOSUM_VS: {
87 unsigned VL = VT.getVectorMinNumElements();
88 if (!VT.isFixedLengthVector())
89 VL *= *getVScaleForTuning();
90 Cost += VL;
91 break;
92 }
// Scalar moves and mask-register ops cost a single instruction regardless
// of LMUL.
93 case RISCV::VMV_X_S:
94 case RISCV::VMV_S_X:
95 case RISCV::VFMV_F_S:
96 case RISCV::VFMV_S_F:
97 case RISCV::VMOR_MM:
98 case RISCV::VMXOR_MM:
99 case RISCV::VMAND_MM:
100 case RISCV::VMANDN_MM:
101 case RISCV::VMNAND_MM:
102 case RISCV::VCPOP_M:
103 case RISCV::VFIRST_M:
104 Cost += 1;
105 break;
// Everything else defaults to one LMUL-scaled unit of work.
106 default:
107 Cost += LMULCost;
108 }
109 }
110 return Cost;
111}
112
// Shared helper: cost of materializing the integer immediate Imm of type Ty.
// Zero is always free thanks to the x0 zero register; everything else defers
// to RISCVMatInt's instruction-sequence cost (FreeZeroes forwarded through).
// NOTE(review): extraction dropped the leading signature lines (return type
// and the DataLayout/CostKind parameters) — verify against upstream.
114 const RISCVSubtarget *ST,
115 const APInt &Imm, Type *Ty,
117 bool FreeZeroes) {
118 assert(Ty->isIntegerTy() &&
119 "getIntImmCost can only estimate cost of materialising integers");
120
121 // We have a Zero register, so 0 is always free.
122 if (Imm == 0)
123 return TTI::TCC_Free;
124
125 // Otherwise, we check how many instructions it will take to materialise.
126 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
127 /*CompressionCost=*/false, FreeZeroes);
128}
129
// Public getIntImmCost: thin wrapper over getIntImmCostImpl with
// FreeZeroes=false. NOTE(review): the signature line was dropped by
// extraction; only the body survives.
132 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
133}
134
135// Look for patterns of shift followed by AND that can be turned into a pair of
136// shifts. We won't need to materialize an immediate for the AND so these can
137// be considered free.
138static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
139 uint64_t Mask = Imm.getZExtValue();
140 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
141 if (!BO || !BO->hasOneUse())
142 return false;
143
144 if (BO->getOpcode() != Instruction::Shl)
145 return false;
146
147 if (!isa<ConstantInt>(BO->getOperand(1)))
148 return false;
149
150 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
151 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
152 // is a mask shifted by c2 bits with c3 leading zeros.
153 if (isShiftedMask_64(Mask)) {
154 unsigned Trailing = llvm::countr_zero(Mask);
155 if (ShAmt == Trailing)
156 return true;
157 }
158
159 return false;
160}
161
// Per-instruction immediate cost used by constant hoisting: returns TCC_Free
// for immediates that codegen folds directly (GEP offsets, bit-manipulation
// idioms, 12-bit arithmetic immediates, power-of-two multiplies) and the full
// materialization cost otherwise.
// NOTE(review): extraction dropped the leading signature lines (function name,
// Opcode/Idx parameters, CostKind parameter) — verify against upstream.
163 const APInt &Imm, Type *Ty,
165 Instruction *Inst) {
166 assert(Ty->isIntegerTy() &&
167 "getIntImmCost can only estimate cost of materialising integers");
168
169 // We have a Zero register, so 0 is always free.
170 if (Imm == 0)
171 return TTI::TCC_Free;
172
173 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
174 // commutative, in others the immediate comes from a specific argument index.
175 bool Takes12BitImm = false;
176 unsigned ImmArgIdx = ~0U;
177
178 switch (Opcode) {
179 case Instruction::GetElementPtr:
180 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
181 // split up large offsets in GEP into better parts than ConstantHoisting
182 // can.
183 return TTI::TCC_Free;
184 case Instruction::Store: {
185 // Use the materialization cost regardless of if it's the address or the
186 // value that is constant, except for if the store is misaligned and
187 // misaligned accesses are not legal (experience shows constant hoisting
188 // can sometimes be harmful in such cases).
189 if (Idx == 1 || !Inst)
190 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
191 /*FreeZeroes=*/true);
192
193 StoreInst *ST = cast<StoreInst>(Inst);
194 if (!getTLI()->allowsMemoryAccessForAlignment(
195 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
196 ST->getPointerAddressSpace(), ST->getAlign()))
197 return TTI::TCC_Free;
198
199 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
200 /*FreeZeroes=*/true);
201 }
202 case Instruction::Load:
203 // If the address is a constant, use the materialization cost.
204 return getIntImmCost(Imm, Ty, CostKind);
205 case Instruction::And:
// AND masks matching a bit-manipulation extension instruction are free.
206 // zext.h
207 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
208 return TTI::TCC_Free;
209 // zext.w
210 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
211 return TTI::TCC_Free;
212 // bclri
213 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
214 return TTI::TCC_Free;
// A shl+and that folds to slli+srli needs no immediate at all.
215 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
216 canUseShiftPair(Inst, Imm))
217 return TTI::TCC_Free;
218 Takes12BitImm = true;
219 break;
220 case Instruction::Add:
221 Takes12BitImm = true;
222 break;
223 case Instruction::Or:
224 case Instruction::Xor:
225 // bseti/binvi
226 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
227 return TTI::TCC_Free;
228 Takes12BitImm = true;
229 break;
230 case Instruction::Mul:
231 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
232 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
233 return TTI::TCC_Free;
234 // One more or less than a power of 2 can use SLLI+ADD/SUB.
235 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
236 return TTI::TCC_Free;
237 // FIXME: There is no MULI instruction.
238 Takes12BitImm = true;
239 break;
// For sub/shift ops only the second operand can be an immediate.
240 case Instruction::Sub:
241 case Instruction::Shl:
242 case Instruction::LShr:
243 case Instruction::AShr:
244 Takes12BitImm = true;
245 ImmArgIdx = 1;
246 break;
247 default:
248 break;
249 }
250
251 if (Takes12BitImm) {
252 // Check immediate is the correct argument...
253 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
254 // ... and fits into the 12-bit immediate.
255 if (Imm.getSignificantBits() <= 64 &&
256 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
257 return TTI::TCC_Free;
258 }
259 }
260
261 // Otherwise, use the full materialisation cost.
262 return getIntImmCost(Imm, Ty, CostKind);
263 }
264
265 // By default, prevent hoisting.
266 return TTI::TCC_Free;
267}
268
// Intrinsic-operand immediate cost: always TCC_Free so constant hoisting
// leaves intrinsic immediates in place. NOTE(review): extraction dropped the
// leading signature lines (function name, IID/Idx/CostKind parameters).
271 const APInt &Imm, Type *Ty,
273 // Prevent hoisting in unknown cases.
274 return TTI::TCC_Free;
275}
276
277bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
278 return ST->hasVInstructions();
279}
280
// Population-count support query: requires Zbb, or the vendor XCVbitmanip
// extension on RV32. NOTE(review): extraction dropped the signature lines and
// the ?: result lines of the return expression (which TTI popcount-support
// value is selected is not visible here) — verify against upstream.
283 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
284 return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
287}
288
// Decide which reduction intrinsics must be expanded by IR-level passes:
// multiply reductions have no RVV equivalent, everything else is left to
// codegen. NOTE(review): the function signature line was dropped by
// extraction; only the body survives.
290 // Currently, the ExpandReductions pass can't expand scalable-vector
291 // reductions, but we still request expansion as RVV doesn't support certain
292 // reductions and the SelectionDAG can't legalize them either.
293 switch (II->getIntrinsicID()) {
294 default:
295 return false;
296 // These reductions have no equivalent in RVV
297 case Intrinsic::vector_reduce_mul:
298 case Intrinsic::vector_reduce_fmul:
299 return true;
300 }
301}
302
// Maximum vscale: a target-specific value when V instructions are available,
// otherwise the base implementation's answer.
// NOTE(review): the return statement inside the `if` was dropped by
// extraction (presumably derived from the real max VLEN) — verify upstream.
303std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
304 if (ST->hasVInstructions())
306 return BaseT::getMaxVScale();
307}
308
// vscale estimate used for cost-model tuning: the known-minimum VLEN divided
// by the RVV block size, when vectors are available and VLEN is at least one
// block. NOTE(review): the fallback return for the "no useful answer" path
// (line 314) was dropped by extraction — verify against upstream.
309std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
310 if (ST->hasVInstructions())
311 if (unsigned MinVLen = ST->getRealMinVLen();
312 MinVLen >= RISCV::RVVBitsPerBlock)
313 return MinVLen / RISCV::RVVBitsPerBlock;
315}
316
// Register width per register class: XLEN for scalars, LMUL * VLEN for fixed
// vectors (0 when RVV fixed-length vectorization is off). The LMUL factor is
// the command-line override clamped to [1,8] and rounded down to a power of
// two, since fractional LMULs are not supported for this query.
// NOTE(review): extraction dropped the signature line, the register-kind case
// labels, and parts of the scalable-vector case — verify against upstream.
319 unsigned LMUL =
320 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
321 switch (K) {
323 return TypeSize::getFixed(ST->getXLen());
325 return TypeSize::getFixed(
326 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
329 (ST->hasVInstructions() &&
332 : 0);
333 }
334
335 llvm_unreachable("Unsupported register kind");
336}
337
// Cost of loading a value of type Ty from the constant pool: two instructions
// of address generation (auipc/addi) plus the memory-op cost of the load
// itself. NOTE(review): the return-type line of the signature was dropped by
// extraction.
339RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
340 // Add a cost of address generation + the cost of the load. The address
341 // is expected to be a PC relative offset to a constant pool entry
342 // using auipc/addi.
343 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
344 /*AddressSpace=*/0, CostKind);
345}
346
347static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
348 unsigned Size = Mask.size();
349 if (!isPowerOf2_32(Size))
350 return false;
351 for (unsigned I = 0; I != Size; ++I) {
352 if (static_cast<unsigned>(Mask[I]) == I)
353 continue;
354 if (Mask[I] != 0)
355 return false;
356 if (Size % I != 0)
357 return false;
358 for (unsigned J = I + 1; J != Size; ++J)
359 // Check the pattern is repeated.
360 if (static_cast<unsigned>(Mask[J]) != J % I)
361 return false;
362 SubVectorSize = I;
363 return true;
364 }
365 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
366 return false;
367}
368
// Build the IR vector type used for a vrgather index operand: same element
// count as the data type but integer elements, narrowed to i16 when the
// integer type would exceed XLEN (the assert above bounds i8 data to <=256
// elements, so 16-bit indices suffice — presumably; confirm against the
// vrgatherei16 lowering). NOTE(review): the first signature line (DataVT and
// subtarget parameters) was dropped by extraction.
370 LLVMContext &C) {
371 assert((DataVT.getScalarSizeInBits() != 8 ||
372 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
373 MVT IndexVT = DataVT.changeTypeToInteger();
374 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
375 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
376 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
377}
378
/// Try to perform better estimation of the permutation.
/// 1. Split the source/destination vectors into real registers.
/// 2. Do the mask analysis to identify which real registers are
/// permuted. If more than 1 source registers are used for the
/// destination register building, the cost for this destination register
/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
/// source register is used, build mask and calculate the cost as a cost
/// of PermuteSingleSrc.
/// Also, for the single register permute we try to identify if the
/// destination register is just a copy of the source register or the
/// copy of the previous destination register (the cost is
/// TTI::TCC_Basic). If the source register is just reused, the cost for
/// this operation is 0.
// NOTE(review): extraction dropped many lines in this function (parts of the
// signature, the early-return/invalid paths, the extract-subvector cost calls
// inside the lambdas, and the mask-splitting call) — the surviving code below
// must be read together with upstream before any behavioral change.
392static InstructionCost
394 std::optional<unsigned> VLen, VectorType *Tp,
397 if (VLen && LegalVT.isFixedLengthVector() && !Mask.empty()) {
398 MVT ElemVT = LegalVT.getVectorElementType();
399 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
400 LegalVT = TTI.getTypeLegalizationCost(
401 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
402 .second;
403 // Number of destination vectors after legalization:
404 NumOfDests = divideCeil(Mask.size(), LegalVT.getVectorNumElements());
405 }
406 if (!NumOfDests.isValid() || NumOfDests <= 1 ||
407 !LegalVT.isFixedLengthVector() ||
409 Tp->getElementType()->getPrimitiveSizeInBits() ||
410 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
412
413 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
414 unsigned LegalVTSize = LegalVT.getStoreSize();
415 // Number of source vectors after legalization:
416 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
417
418 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
419 LegalVT.getVectorNumElements());
420
421 unsigned E = *NumOfDests.getValue();
422 unsigned NormalizedVF =
423 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
424 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
425 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
// Pad the mask with poison out to a whole number of registers before the
// per-register analysis.
426 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
427 assert(NormalizedVF >= Mask.size() &&
428 "Normalized mask expected to be not shorter than original mask.");
429 copy(Mask, NormalizedMask.begin());
431 SmallBitVector ExtractedRegs(2 * NumOfSrcRegs);
432 int NumShuffles = 0;
434 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
// Single-source-register case: pay an extract once per source register, then
// a single-source permute unless the register mask is the identity.
435 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
436 if (ExtractedRegs.test(SrcReg)) {
438 (SrcReg % NumOfSrcRegs) *
439 SingleOpTy->getNumElements(),
440 SingleOpTy);
441 ExtractedRegs.set(SrcReg);
442 }
443 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
444 ++NumShuffles;
446 RegMask, CostKind, 0, nullptr);
447 return;
448 }
449 },
// Two-source-register case: extracts for both inputs plus one two-source
// permute (counted as two shuffles for the code-size limit below).
450 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
451 if (ExtractedRegs.test(Idx1)) {
454 (Idx1 % NumOfSrcRegs) * SingleOpTy->getNumElements(), SingleOpTy);
455 ExtractedRegs.set(Idx1);
456 }
457 if (ExtractedRegs.test(Idx2)) {
460 (Idx2 % NumOfSrcRegs) * SingleOpTy->getNumElements(), SingleOpTy);
461 ExtractedRegs.set(Idx2);
462 }
463 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
464 CostKind, 0, nullptr);
465 NumShuffles += 2;
466 });
467 // Note: check that we do not emit too many shuffles here to prevent code
468 // size explosion.
469 // TODO: investigate, if it can be improved by extra analysis of the masks
470 // to check if the code is more profitable.
471 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
472 (NumOfDestRegs <= 2 && NumShuffles < 4))
473 return Cost;
475}
476
// Shuffle cost model. First tries fixed-length-vector special cases
// (interleave/deinterleave, repeated concat, vrgather-based permutes, and the
// per-vreg splitting estimate), then falls through to the scalable-vector
// handling of extract/insert-subvector, select, broadcast, splice and
// reverse, and finally to the generic base implementation.
// NOTE(review): extraction dropped many lines here (the function name and
// Kind/CostKind parameters, several `case TTI::SK_...:` labels, InstructionCost
// accumulator declarations, and some call lines) — the switch structure below
// is incomplete as shown; verify against upstream before editing.
478 VectorType *Tp, ArrayRef<int> Mask,
480 int Index, VectorType *SubTp,
482 const Instruction *CxtI) {
483 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
484
485 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
486
487 // First, handle cases where having a fixed length vector enables us to
488 // give a more accurate cost than falling back to generic scalable codegen.
489 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
490 if (ST->hasVInstructions() && isa<FixedVectorType>(Tp)) {
492 *this, LT.second, ST->getRealVLen(), Tp, Mask, CostKind);
493 if (VRegSplittingCost.isValid())
494 return VRegSplittingCost;
495 switch (Kind) {
496 default:
497 break;
499 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
500 MVT EltTp = LT.second.getVectorElementType();
501 // If the size of the element is < ELEN then shuffles of interleaves and
502 // deinterleaves of 2 vectors can be lowered into the following
503 // sequences
504 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
505 // Example sequence:
506 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
507 // vwaddu.vv v10, v8, v9
508 // li a0, -1 (ignored)
509 // vwmaccu.vx v10, a0, v9
510 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
511 return 2 * LT.first * TLI->getLMULCost(LT.second);
512
513 if (Mask[0] == 0 || Mask[0] == 1) {
514 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
515 // Example sequence:
516 // vnsrl.wi v10, v8, 0
517 if (equal(DeinterleaveMask, Mask))
518 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
519 LT.second, CostKind);
520 }
521 }
// Repeated-concat masks lower to a chain of vslideup.vi inserts, doubling
// the inserted width each step.
522 int SubVectorSize;
523 if (LT.second.getScalarSizeInBits() != 1 &&
524 isRepeatedConcatMask(Mask, SubVectorSize)) {
526 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
527 // The cost of extraction from a subvector is 0 if the index is 0.
528 for (unsigned I = 0; I != NumSlides; ++I) {
529 unsigned InsertIndex = SubVectorSize * (1 << I);
530 FixedVectorType *SubTp =
531 FixedVectorType::get(Tp->getElementType(), InsertIndex);
532 FixedVectorType *DestTp =
534 std::pair<InstructionCost, MVT> DestLT =
536 // Add the cost of whole vector register move because the
537 // destination vector register group for vslideup cannot overlap the
538 // source.
539 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
541 CostKind, InsertIndex, SubTp);
542 }
543 return Cost;
544 }
545 }
546 // vrgather + cost of generating the mask constant.
547 // We model this for an unknown mask with a single vrgather.
548 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
549 (LT.second.getScalarSizeInBits() != 8 ||
550 LT.second.getVectorNumElements() <= 256)) {
551 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
552 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
553 return IndexCost +
554 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
555 }
556 [[fallthrough]];
557 }
560 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
561 // register for the second vrgather. We model this for an unknown
562 // (shuffle) mask.
563 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
564 (LT.second.getScalarSizeInBits() != 8 ||
565 LT.second.getVectorNumElements() <= 256)) {
566 auto &C = Tp->getContext();
567 auto EC = Tp->getElementCount();
568 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
570 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
571 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
572 return 2 * IndexCost +
573 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
574 LT.second, CostKind) +
575 MaskCost;
576 }
577 [[fallthrough]];
578 }
579 case TTI::SK_Select: {
580 // We are going to permute multiple sources and the result will be in
581 // multiple destinations. Providing an accurate cost only for splits where
582 // the element type remains the same.
583 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
584 LT.second.isFixedLengthVector() &&
585 LT.second.getVectorElementType().getSizeInBits() ==
587 LT.second.getVectorNumElements() <
588 cast<FixedVectorType>(Tp)->getNumElements() &&
589 divideCeil(Mask.size(),
590 cast<FixedVectorType>(Tp)->getNumElements()) ==
591 static_cast<unsigned>(*LT.first.getValue())) {
592 unsigned NumRegs = *LT.first.getValue();
593 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
594 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
595 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
596
598 for (unsigned I = 0, NumSrcRegs = divideCeil(Mask.size(), SubVF);
599 I < NumSrcRegs; ++I) {
600 bool IsSingleVector = true;
601 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
// Rebase each per-register slice of the mask so it addresses at most two
// SubVF-wide operands; identity slices are skipped below.
602 transform(
603 Mask.slice(I * SubVF,
604 I == NumSrcRegs - 1 ? Mask.size() % SubVF : SubVF),
605 SubMask.begin(), [&](int I) -> int {
606 if (I == PoisonMaskElem)
607 return PoisonMaskElem;
608 bool SingleSubVector = I / VF == 0;
609 IsSingleVector &= SingleSubVector;
610 return (SingleSubVector ? 0 : 1) * SubVF + (I % VF) % SubVF;
611 });
612 if (all_of(enumerate(SubMask), [](auto &&P) {
613 return P.value() == PoisonMaskElem ||
614 static_cast<unsigned>(P.value()) == P.index();
615 }))
616 continue;
619 SubVecTy, SubMask, CostKind, 0, nullptr);
620 }
621 return Cost;
622 }
623 break;
624 }
625 }
626 };
627
628 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
629 switch (Kind) {
630 default:
631 // Fallthrough to generic handling.
632 // TODO: Most of these cases will return getInvalid in generic code, and
633 // must be implemented here.
634 break;
636 // Extract at zero is always a subregister extract
637 if (Index == 0)
638 return TTI::TCC_Free;
639
640 // If we're extracting a subvector of at most m1 size at a sub-register
641 // boundary - which unfortunately we need exact vlen to identify - this is
642 // a subregister extract at worst and thus won't require a vslidedown.
643 // TODO: Extend for aligned m2, m4 subvector extracts
644 // TODO: Extend for misalgined (but contained) extracts
645 // TODO: Extend for scalable subvector types
646 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
647 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
648 const unsigned MinVLen = ST->getRealMinVLen();
649 const unsigned MaxVLen = ST->getRealMaxVLen();
650 if (MinVLen == MaxVLen &&
651 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
652 SubLT.second.getSizeInBits() <= MinVLen)
653 return TTI::TCC_Free;
654 }
655
656 // Example sequence:
657 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
658 // vslidedown.vi v8, v9, 2
659 return LT.first *
660 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
662 // Example sequence:
663 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
664 // vslideup.vi v8, v9, 2
665 return LT.first *
666 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
667 case TTI::SK_Select: {
668 // Example sequence:
669 // li a0, 90
670 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
671 // vmv.s.x v0, a0
672 // vmerge.vvm v8, v9, v8, v0
673 // We use 2 for the cost of the mask materialization as this is the true
674 // cost for small masks and most shuffles are small. At worst, this cost
675 // should be a very small constant for the constant pool load. As such,
676 // we may bias towards large selects slightly more than truly warranted.
677 return LT.first *
678 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
679 LT.second, CostKind));
680 }
681 case TTI::SK_Broadcast: {
// HasScalar: the broadcast source is an insertelement, so the scalar is
// available in a register and vmv.v.x applies directly.
682 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
683 Instruction::InsertElement);
684 if (LT.second.getScalarSizeInBits() == 1) {
685 if (HasScalar) {
686 // Example sequence:
687 // andi a0, a0, 1
688 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
689 // vmv.v.x v8, a0
690 // vmsne.vi v0, v8, 0
691 return LT.first *
692 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
693 LT.second, CostKind));
694 }
695 // Example sequence:
696 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
697 // vmv.v.i v8, 0
698 // vmerge.vim v8, v8, 1, v0
699 // vmv.x.s a0, v8
700 // andi a0, a0, 1
701 // vmv.v.x v8, a0
702 // vmsne.vi v0, v8, 0
703
704 return LT.first *
705 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
706 RISCV::VMV_X_S, RISCV::VMV_V_X,
707 RISCV::VMSNE_VI},
708 LT.second, CostKind));
709 }
710
711 if (HasScalar) {
712 // Example sequence:
713 // vmv.v.x v8, a0
714 return LT.first *
715 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
716 }
717
718 // Example sequence:
719 // vrgather.vi v9, v8, 0
720 return LT.first *
721 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
722 }
723 case TTI::SK_Splice: {
724 // vslidedown+vslideup.
725 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
726 // of similar code, but I think we expand through memory.
727 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
728 if (Index >= 0 && Index < 32)
729 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
730 else if (Index < 0 && Index > -32)
731 Opcodes[1] = RISCV::VSLIDEUP_VI;
732 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
733 }
734 case TTI::SK_Reverse: {
735 // TODO: Cases to improve here:
736 // * Illegal vector types
737 // * i64 on RV32
738 // * i1 vector
739 // At low LMUL, most of the cost is producing the vrgather index register.
740 // At high LMUL, the cost of the vrgather itself will dominate.
741 // Example sequence:
742 // csrr a0, vlenb
743 // srli a0, a0, 3
744 // addi a0, a0, -1
745 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
746 // vid.v v9
747 // vrsub.vx v10, v9, a0
748 // vrgather.vv v9, v8, v10
749 InstructionCost LenCost = 3;
750 if (LT.second.isFixedLengthVector())
751 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
752 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
753 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
754 if (LT.second.isFixedLengthVector() &&
755 isInt<5>(LT.second.getVectorNumElements() - 1))
756 Opcodes[1] = RISCV::VRSUB_VI;
757 InstructionCost GatherCost =
758 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
759 // Mask operation additionally required extend and truncate
760 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
761 return LT.first * (LenCost + GatherCost + ExtendCost);
762 }
763 }
764 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
765}
766
// True when VT's register group is one vector register (m1) or a fraction
// thereof. NOTE(review): extraction dropped the LMUL computation line and
// part of the return expression (only two of the fractional-LMUL comparisons
// survive); the `unsigned` return of a boolean expression also looks
// suspect — verify against upstream.
767static unsigned isM1OrSmaller(MVT VT) {
769 return (LMUL == RISCVII::VLMUL::LMUL_F8 || LMUL == RISCVII::VLMUL::LMUL_F4 ||
771}
772
// Scalarization overhead for insert/extract of vector elements. For
// insert-only cases the generic estimate is capped by the cost of building
// the vector with one vslide1down.vx per element (valid for m1-or-smaller
// containers); i1 vectors are costed via their i8-widened form plus a trunc.
// NOTE(review): extraction dropped lines here (the CostKind parameter, the
// scalable-vector early return, the base-class call producing Cost, and the
// cast cost-kind argument) — verify against upstream.
774 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
776 if (isa<ScalableVectorType>(Ty))
778
779 // A build_vector (which is m1 sized or smaller) can be done in no
780 // worse than one vslide1down.vx per element in the type. We could
781 // in theory do an explode_vector in the inverse manner, but our
782 // lowering today does not have a first class node for this pattern.
784 Ty, DemandedElts, Insert, Extract, CostKind);
785 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
786 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
787 if (Ty->getScalarSizeInBits() == 1) {
788 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
789 // Note: Implicit scalar anyextend is assumed to be free since the i1
790 // must be stored in a GPR.
791 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
792 CostKind) +
793 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
795 }
796
797 assert(LT.second.isFixedLengthVector());
798 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
799 if (isM1OrSmaller(ContainerVT)) {
800 InstructionCost BV =
801 cast<FixedVectorType>(Ty)->getNumElements() *
802 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
// Take the cheaper of the generic estimate and the build-vector sequence.
803 if (BV < Cost)
804 Cost = BV;
805 }
806 }
807 return Cost;
808}
809
// Masked load/store cost: same as the unmasked memory op when the masked
// access is legal, otherwise defer to the (scalarizing) base implementation.
// NOTE(review): extraction dropped the return-type line and part of the
// legality condition (line 815) — verify against upstream.
811RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
812 unsigned AddressSpace,
814 if (!isLegalMaskedLoadStore(Src, Alignment) ||
816 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
817 CostKind);
818
819 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
820}
821
// Interleaved load/store cost: models vlseg/vsseg lowering when the access is
// a legal segment access, and otherwise falls back to wide-memory-op +
// shuffle modeling for fixed vectors (Factor==2 stores) or the base
// implementation. NOTE(review): extraction dropped lines (the function-name
// line, InstructionCost declarations at 846/896-ish, the scalable-vector
// invalid return at 867, and the getShuffleCost kind arguments at 886/913) —
// verify against upstream.
823 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
824 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
825 bool UseMaskForCond, bool UseMaskForGaps) {
826
827 // The interleaved memory access pass will lower interleaved memory ops (i.e
828 // a load and store followed by a specific shuffle) to vlseg/vsseg
829 // intrinsics.
830 if (!UseMaskForCond && !UseMaskForGaps &&
831 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
832 auto *VTy = cast<VectorType>(VecTy);
833 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
834 // Need to make sure type has't been scalarized
835 if (LT.second.isVector()) {
836 auto *SubVecTy =
837 VectorType::get(VTy->getElementType(),
838 VTy->getElementCount().divideCoefficientBy(Factor));
839 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
840 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
841 AddressSpace, DL)) {
842
843 // Some processors optimize segment loads/stores as one wide memory op +
844 // Factor * LMUL shuffle ops.
845 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
847 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
848 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
849 Cost += Factor * TLI->getLMULCost(SubVecVT);
850 return LT.first * Cost;
851 }
852
853 // Otherwise, the cost is proportional to the number of elements (VL *
854 // Factor ops).
855 InstructionCost MemOpCost =
856 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
857 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
858 unsigned NumLoads = getEstimatedVLFor(VTy);
859 return NumLoads * MemOpCost;
860 }
861 }
862 }
863
864 // TODO: Return the cost of interleaved accesses for scalable vector when
865 // unable to convert to segment accesses instructions.
866 if (isa<ScalableVectorType>(VecTy))
868
869 auto *FVTy = cast<FixedVectorType>(VecTy);
870 InstructionCost MemCost =
871 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
872 unsigned VF = FVTy->getNumElements() / Factor;
873
874 // An interleaved load will look like this for Factor=3:
875 // %wide.vec = load <12 x i32>, ptr %3, align 4
876 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
877 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
878 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
879 if (Opcode == Instruction::Load) {
880 InstructionCost Cost = MemCost;
881 for (unsigned Index : Indices) {
882 FixedVectorType *SubVecTy =
883 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
884 auto Mask = createStrideMask(Index, Factor, VF);
885 InstructionCost ShuffleCost =
887 CostKind, 0, nullptr, {});
888 Cost += ShuffleCost;
889 }
890 return Cost;
891 }
892
893 // TODO: Model for NF > 2
894 // We'll need to enhance getShuffleCost to model shuffles that are just
895 // inserts and extracts into subvectors, since they won't have the full cost
896 // of a vrgather.
897 // An interleaved store for 3 vectors of 4 lanes will look like
898 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
899 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
900 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
901 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
902 // store <12 x i32> %interleaved.vec, ptr %10, align 4
903 if (Factor != 2)
904 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
905 Alignment, AddressSpace, CostKind,
906 UseMaskForCond, UseMaskForGaps);
907
908 assert(Opcode == Instruction::Store && "Opcode must be a store");
909 // For an interleaving store of 2 vectors, we perform one large interleaving
910 // shuffle that goes into the wide store
911 auto Mask = createInterleaveMask(VF, Factor);
912 InstructionCost ShuffleCost =
914 CostKind, 0, nullptr, {});
915 return MemCost + ShuffleCost;
916}
917
// Gather/scatter cost: when the masked gather/scatter is legal, cost is the
// estimated VL multiplied by the per-element memory-op cost; otherwise defer
// to the base implementation. NOTE(review): extraction dropped the
// function-name line and the condition guarding the first base-class return
// (line 921) — verify against upstream.
919 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
920 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
922 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
923 Alignment, CostKind, I);
924
925 if ((Opcode == Instruction::Load &&
926 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
927 (Opcode == Instruction::Store &&
928 !isLegalMaskedScatter(DataTy, Align(Alignment))))
929 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
930 Alignment, CostKind, I);
931
932 // Cost is proportional to the number of memory operations implied. For
933 // scalable vectors, we use an estimate on that number since we don't
934 // know exactly what VL will be.
935 auto &VTy = *cast<VectorType>(DataTy);
936 InstructionCost MemOpCost =
937 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
938 {TTI::OK_AnyValue, TTI::OP_None}, I);
939 unsigned NumLoads = getEstimatedVLFor(&VTy);
940 return NumLoads * MemOpCost;
941}
942
// Cost of expanding-load / compressing-store: the plain memory-op cost plus
// the RVV instruction sequence (vsetvli, optional vcpop.m for a variable
// mask, then vcompress.vm for stores or vsetivli+viota.m+vrgather.vv for
// loads), scaled by the legalization factor. NOTE(review): extraction dropped
// the function-name line and the CostKind/I parameter line — verify upstream.
944 unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
946 bool IsLegal = (Opcode == Instruction::Store &&
947 isLegalMaskedCompressStore(DataTy, Alignment)) ||
948 (Opcode == Instruction::Load &&
949 isLegalMaskedExpandLoad(DataTy, Alignment));
950 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
951 return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
952 Alignment, CostKind, I);
953 // Example compressstore sequence:
954 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
955 // vcompress.vm v10, v8, v0
956 // vcpop.m a1, v0
957 // vsetvli zero, a1, e32, m2, ta, ma
958 // vse32.v v10, (a0)
959 // Example expandload sequence:
960 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
961 // vcpop.m a1, v0
962 // vsetvli zero, a1, e32, m2, ta, ma
963 // vle32.v v10, (a0)
964 // vsetivli zero, 8, e32, m2, ta, ma
965 // viota.m v12, v0
966 // vrgather.vv v8, v10, v12, v0.t
967 auto MemOpCost =
968 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
969 auto LT = getTypeLegalizationCost(DataTy);
970 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
971 if (VariableMask)
972 Opcodes.push_back(RISCV::VCPOP_M);
973 if (Opcode == Instruction::Store)
974 Opcodes.append({RISCV::VCOMPRESS_VM});
975 else
976 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
977 return MemOpCost +
978 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
979}
980
982 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
983 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
984 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
985 !isLegalStridedLoadStore(DataTy, Alignment)) ||
986 (Opcode != Instruction::Load && Opcode != Instruction::Store))
987 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
988 Alignment, CostKind, I);
989
991 return TTI::TCC_Basic;
992
993 // Cost is proportional to the number of memory operations implied. For
994 // scalable vectors, we use an estimate on that number since we don't
995 // know exactly what VL will be.
996 auto &VTy = *cast<VectorType>(DataTy);
997 InstructionCost MemOpCost =
998 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
999 {TTI::OK_AnyValue, TTI::OP_None}, I);
1000 unsigned NumLoads = getEstimatedVLFor(&VTy);
1001 return NumLoads * MemOpCost;
1002}
1003
1006 // FIXME: This is a property of the default vector convention, not
1007 // all possible calling conventions. Fixing that will require
1008 // some TTI API and SLP rework.
1011 for (auto *Ty : Tys) {
1012 if (!Ty->isVectorTy())
1013 continue;
1014 Align A = DL.getPrefTypeAlign(Ty);
1015 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1016 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1017 }
1018 return Cost;
1019}
1020
1021// Currently, these represent both throughput and codesize costs
1022// for the respective intrinsics. The costs in this table are simply
1023// instruction counts with the following adjustments made:
1024// * One vsetvli is considered free.
1026 {Intrinsic::floor, MVT::f32, 9},
1027 {Intrinsic::floor, MVT::f64, 9},
1028 {Intrinsic::ceil, MVT::f32, 9},
1029 {Intrinsic::ceil, MVT::f64, 9},
1030 {Intrinsic::trunc, MVT::f32, 7},
1031 {Intrinsic::trunc, MVT::f64, 7},
1032 {Intrinsic::round, MVT::f32, 9},
1033 {Intrinsic::round, MVT::f64, 9},
1034 {Intrinsic::roundeven, MVT::f32, 9},
1035 {Intrinsic::roundeven, MVT::f64, 9},
1036 {Intrinsic::rint, MVT::f32, 7},
1037 {Intrinsic::rint, MVT::f64, 7},
1038 {Intrinsic::lrint, MVT::i32, 1},
1039 {Intrinsic::lrint, MVT::i64, 1},
1040 {Intrinsic::llrint, MVT::i64, 1},
1041 {Intrinsic::nearbyint, MVT::f32, 9},
1042 {Intrinsic::nearbyint, MVT::f64, 9},
1043 {Intrinsic::bswap, MVT::i16, 3},
1044 {Intrinsic::bswap, MVT::i32, 12},
1045 {Intrinsic::bswap, MVT::i64, 31},
1046 {Intrinsic::vp_bswap, MVT::i16, 3},
1047 {Intrinsic::vp_bswap, MVT::i32, 12},
1048 {Intrinsic::vp_bswap, MVT::i64, 31},
1049 {Intrinsic::vp_fshl, MVT::i8, 7},
1050 {Intrinsic::vp_fshl, MVT::i16, 7},
1051 {Intrinsic::vp_fshl, MVT::i32, 7},
1052 {Intrinsic::vp_fshl, MVT::i64, 7},
1053 {Intrinsic::vp_fshr, MVT::i8, 7},
1054 {Intrinsic::vp_fshr, MVT::i16, 7},
1055 {Intrinsic::vp_fshr, MVT::i32, 7},
1056 {Intrinsic::vp_fshr, MVT::i64, 7},
1057 {Intrinsic::bitreverse, MVT::i8, 17},
1058 {Intrinsic::bitreverse, MVT::i16, 24},
1059 {Intrinsic::bitreverse, MVT::i32, 33},
1060 {Intrinsic::bitreverse, MVT::i64, 52},
1061 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1062 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1063 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1064 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1065 {Intrinsic::ctpop, MVT::i8, 12},
1066 {Intrinsic::ctpop, MVT::i16, 19},
1067 {Intrinsic::ctpop, MVT::i32, 20},
1068 {Intrinsic::ctpop, MVT::i64, 21},
1069 {Intrinsic::ctlz, MVT::i8, 19},
1070 {Intrinsic::ctlz, MVT::i16, 28},
1071 {Intrinsic::ctlz, MVT::i32, 31},
1072 {Intrinsic::ctlz, MVT::i64, 35},
1073 {Intrinsic::cttz, MVT::i8, 16},
1074 {Intrinsic::cttz, MVT::i16, 23},
1075 {Intrinsic::cttz, MVT::i32, 24},
1076 {Intrinsic::cttz, MVT::i64, 25},
1077 {Intrinsic::vp_ctpop, MVT::i8, 12},
1078 {Intrinsic::vp_ctpop, MVT::i16, 19},
1079 {Intrinsic::vp_ctpop, MVT::i32, 20},
1080 {Intrinsic::vp_ctpop, MVT::i64, 21},
1081 {Intrinsic::vp_ctlz, MVT::i8, 19},
1082 {Intrinsic::vp_ctlz, MVT::i16, 28},
1083 {Intrinsic::vp_ctlz, MVT::i32, 31},
1084 {Intrinsic::vp_ctlz, MVT::i64, 35},
1085 {Intrinsic::vp_cttz, MVT::i8, 16},
1086 {Intrinsic::vp_cttz, MVT::i16, 23},
1087 {Intrinsic::vp_cttz, MVT::i32, 24},
1088 {Intrinsic::vp_cttz, MVT::i64, 25},
1089};
1090
1092 switch (ID) {
1093#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
1094 case Intrinsic::VPID: \
1095 return ISD::VPSD;
1096#include "llvm/IR/VPIntrinsics.def"
1097#undef HELPER_MAP_VPID_TO_VPSD
1098 }
1099 return ISD::DELETED_NODE;
1100}
1101
1105 auto *RetTy = ICA.getReturnType();
1106 switch (ICA.getID()) {
1107 case Intrinsic::lrint:
1108 case Intrinsic::llrint:
1109 // We can't currently lower half or bfloat vector lrint/llrint.
1110 if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]);
1111 VecTy && VecTy->getElementType()->is16bitFPTy())
1113 [[fallthrough]];
1114 case Intrinsic::ceil:
1115 case Intrinsic::floor:
1116 case Intrinsic::trunc:
1117 case Intrinsic::rint:
1118 case Intrinsic::round:
1119 case Intrinsic::roundeven: {
1120 // These all use the same code.
1121 auto LT = getTypeLegalizationCost(RetTy);
1122 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1123 return LT.first * 8;
1124 break;
1125 }
1126 case Intrinsic::umin:
1127 case Intrinsic::umax:
1128 case Intrinsic::smin:
1129 case Intrinsic::smax: {
1130 auto LT = getTypeLegalizationCost(RetTy);
1131 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1132 return LT.first;
1133
1134 if (ST->hasVInstructions() && LT.second.isVector()) {
1135 unsigned Op;
1136 switch (ICA.getID()) {
1137 case Intrinsic::umin:
1138 Op = RISCV::VMINU_VV;
1139 break;
1140 case Intrinsic::umax:
1141 Op = RISCV::VMAXU_VV;
1142 break;
1143 case Intrinsic::smin:
1144 Op = RISCV::VMIN_VV;
1145 break;
1146 case Intrinsic::smax:
1147 Op = RISCV::VMAX_VV;
1148 break;
1149 }
1150 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1151 }
1152 break;
1153 }
1154 case Intrinsic::sadd_sat:
1155 case Intrinsic::ssub_sat:
1156 case Intrinsic::uadd_sat:
1157 case Intrinsic::usub_sat: {
1158 auto LT = getTypeLegalizationCost(RetTy);
1159 if (ST->hasVInstructions() && LT.second.isVector()) {
1160 unsigned Op;
1161 switch (ICA.getID()) {
1162 case Intrinsic::sadd_sat:
1163 Op = RISCV::VSADD_VV;
1164 break;
1165 case Intrinsic::ssub_sat:
1166 Op = RISCV::VSSUBU_VV;
1167 break;
1168 case Intrinsic::uadd_sat:
1169 Op = RISCV::VSADDU_VV;
1170 break;
1171 case Intrinsic::usub_sat:
1172 Op = RISCV::VSSUBU_VV;
1173 break;
1174 }
1175 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1176 }
1177 break;
1178 }
1179 case Intrinsic::fma:
1180 case Intrinsic::fmuladd: {
1181 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1182 auto LT = getTypeLegalizationCost(RetTy);
1183 if (ST->hasVInstructions() && LT.second.isVector())
1184 return LT.first *
1185 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1186 break;
1187 }
1188 case Intrinsic::fabs: {
1189 auto LT = getTypeLegalizationCost(RetTy);
1190 if (ST->hasVInstructions() && LT.second.isVector()) {
1191 // lui a0, 8
1192 // addi a0, a0, -1
1193 // vsetvli a1, zero, e16, m1, ta, ma
1194 // vand.vx v8, v8, a0
1195 // f16 with zvfhmin and bf16 with zvfhbmin
1196 if (LT.second.getVectorElementType() == MVT::bf16 ||
1197 (LT.second.getVectorElementType() == MVT::f16 &&
1198 !ST->hasVInstructionsF16()))
1199 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1200 CostKind) +
1201 2;
1202 else
1203 return LT.first *
1204 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1205 }
1206 break;
1207 }
1208 case Intrinsic::sqrt: {
1209 auto LT = getTypeLegalizationCost(RetTy);
1210 if (ST->hasVInstructions() && LT.second.isVector()) {
1213 MVT ConvType = LT.second;
1214 MVT FsqrtType = LT.second;
1215 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1216 // will be spilt.
1217 if (LT.second.getVectorElementType() == MVT::bf16) {
1218 if (LT.second == MVT::nxv32bf16) {
1219 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1220 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1221 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1222 ConvType = MVT::nxv16f16;
1223 FsqrtType = MVT::nxv16f32;
1224 } else {
1225 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1226 FsqrtOp = {RISCV::VFSQRT_V};
1227 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1228 }
1229 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1230 !ST->hasVInstructionsF16()) {
1231 if (LT.second == MVT::nxv32f16) {
1232 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1233 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1234 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1235 ConvType = MVT::nxv16f16;
1236 FsqrtType = MVT::nxv16f32;
1237 } else {
1238 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1239 FsqrtOp = {RISCV::VFSQRT_V};
1240 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1241 }
1242 } else {
1243 FsqrtOp = {RISCV::VFSQRT_V};
1244 }
1245
1246 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1247 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1248 }
1249 break;
1250 }
1251 case Intrinsic::cttz:
1252 case Intrinsic::ctlz:
1253 case Intrinsic::ctpop: {
1254 auto LT = getTypeLegalizationCost(RetTy);
1255 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) {
1256 unsigned Op;
1257 switch (ICA.getID()) {
1258 case Intrinsic::cttz:
1259 Op = RISCV::VCTZ_V;
1260 break;
1261 case Intrinsic::ctlz:
1262 Op = RISCV::VCLZ_V;
1263 break;
1264 case Intrinsic::ctpop:
1265 Op = RISCV::VCPOP_V;
1266 break;
1267 }
1268 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1269 }
1270 break;
1271 }
1272 case Intrinsic::abs: {
1273 auto LT = getTypeLegalizationCost(RetTy);
1274 if (ST->hasVInstructions() && LT.second.isVector()) {
1275 // vrsub.vi v10, v8, 0
1276 // vmax.vv v8, v8, v10
1277 return LT.first *
1278 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1279 LT.second, CostKind);
1280 }
1281 break;
1282 }
1283 case Intrinsic::get_active_lane_mask: {
1284 if (ST->hasVInstructions()) {
1285 Type *ExpRetTy = VectorType::get(
1286 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1287 auto LT = getTypeLegalizationCost(ExpRetTy);
1288
1289 // vid.v v8 // considered hoisted
1290 // vsaddu.vx v8, v8, a0
1291 // vmsltu.vx v0, v8, a1
1292 return LT.first *
1293 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1294 LT.second, CostKind);
1295 }
1296 break;
1297 }
1298 // TODO: add more intrinsic
1299 case Intrinsic::stepvector: {
1300 auto LT = getTypeLegalizationCost(RetTy);
1301 // Legalisation of illegal types involves an `index' instruction plus
1302 // (LT.first - 1) vector adds.
1303 if (ST->hasVInstructions())
1304 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1305 (LT.first - 1) *
1306 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1307 return 1 + (LT.first - 1);
1308 }
1309 case Intrinsic::experimental_cttz_elts: {
1310 Type *ArgTy = ICA.getArgTypes()[0];
1311 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1312 if (getTLI()->shouldExpandCttzElements(ArgType))
1313 break;
1314 InstructionCost Cost = getRISCVInstructionCost(
1315 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1316
1317 // If zero_is_poison is false, then we will generate additional
1318 // cmp + select instructions to convert -1 to EVL.
1319 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1320 if (ICA.getArgs().size() > 1 &&
1321 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1322 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1324 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1326
1327 return Cost;
1328 }
1329 case Intrinsic::vp_rint: {
1330 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
1331 unsigned Cost = 5;
1332 auto LT = getTypeLegalizationCost(RetTy);
1333 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1334 return Cost * LT.first;
1335 break;
1336 }
1337 case Intrinsic::vp_nearbyint: {
1338 // More one read and one write for fflags than vp_rint.
1339 unsigned Cost = 7;
1340 auto LT = getTypeLegalizationCost(RetTy);
1341 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1342 return Cost * LT.first;
1343 break;
1344 }
1345 case Intrinsic::vp_ceil:
1346 case Intrinsic::vp_floor:
1347 case Intrinsic::vp_round:
1348 case Intrinsic::vp_roundeven:
1349 case Intrinsic::vp_roundtozero: {
1350 // Rounding with static rounding mode needs two more instructions to
1351 // swap/write FRM than vp_rint.
1352 unsigned Cost = 7;
1353 auto LT = getTypeLegalizationCost(RetTy);
1354 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
1355 if (TLI->isOperationCustom(VPISD, LT.second))
1356 return Cost * LT.first;
1357 break;
1358 }
1359 case Intrinsic::vp_fneg: {
1360 std::optional<unsigned> FOp =
1362 assert(FOp.has_value());
1363 return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
1364 break;
1365 }
1366 case Intrinsic::vp_select: {
1367 Intrinsic::ID IID = ICA.getID();
1368 std::optional<unsigned> FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID);
1369 assert(FOp.has_value());
1370 return getCmpSelInstrCost(*FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
1372 }
1373 case Intrinsic::vp_merge:
1374 return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(),
1376 CostKind);
1377 case Intrinsic::experimental_vp_splat: {
1378 auto LT = getTypeLegalizationCost(RetTy);
1379 // TODO: Lower i1 experimental_vp_splat
1380 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1382 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1383 ? RISCV::VFMV_V_F
1384 : RISCV::VMV_V_X,
1385 LT.second, CostKind);
1386 }
1387 case Intrinsic::experimental_vp_splice: {
1388 // To support type-based query from vectorizer, set the index to 0.
1389 // Note that index only change the cost from vslide.vx to vslide.vi and in
1390 // current implementations they have same costs.
1392 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1393 0, cast<VectorType>(ICA.getReturnType()));
1394 }
1395 }
1396
1397 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1398 if (auto LT = getTypeLegalizationCost(RetTy);
1399 LT.second.isVector()) {
1400 MVT EltTy = LT.second.getVectorElementType();
1401 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1402 ICA.getID(), EltTy))
1403 return LT.first * Entry->Cost;
1404 }
1405 }
1406
1408}
1409
1411 Type *Src,
1414 const Instruction *I) {
1415 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1416 if (!IsVectorType)
1417 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1418
1419 // FIXME: Need to compute legalizing cost for illegal types. The current
1420 // code handles only legal types and those which can be trivially
1421 // promoted to legal.
1422 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1423 Dst->getScalarSizeInBits() > ST->getELen())
1424 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1425
1426 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1427 assert(ISD && "Invalid opcode");
1428 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1429 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1430
1431 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1432 // The shared implementation doesn't model vector widening during legalization
1433 // and instead assumes scalarization. In order to scalarize an <N x i1>
1434 // vector, we need to extend/trunc to/from i8. If we don't special case
1435 // this, we can get an infinite recursion cycle.
1436 switch (ISD) {
1437 default:
1438 break;
1439 case ISD::SIGN_EXTEND:
1440 case ISD::ZERO_EXTEND:
1441 if (Src->getScalarSizeInBits() == 1) {
1442 // We do not use vsext/vzext to extend from mask vector.
1443 // Instead we use the following instructions to extend from mask vector:
1444 // vmv.v.i v8, 0
1445 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1446 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1447 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1448 DstLT.second, CostKind) +
1449 DstLT.first - 1;
1450 }
1451 break;
1452 case ISD::TRUNCATE:
1453 if (Dst->getScalarSizeInBits() == 1) {
1454 // We do not use several vncvt to truncate to mask vector. So we could
1455 // not use PowDiff to calculate it.
1456 // Instead we use the following instructions to truncate to mask vector:
1457 // vand.vi v8, v8, 1
1458 // vmsne.vi v0, v8, 0
1459 return SrcLT.first *
1460 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1461 SrcLT.second, CostKind) +
1462 SrcLT.first - 1;
1463 }
1464 break;
1465 };
1466
1467 // Our actual lowering for the case where a wider legal type is available
1468 // uses promotion to the wider type. This is reflected in the result of
1469 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1470 // scalarized if the legalized Src and Dst are not equal sized.
1471 const DataLayout &DL = this->getDataLayout();
1472 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1474 SrcLT.second.getSizeInBits()) ||
1476 DstLT.second.getSizeInBits()))
1477 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1478
1479 // The split cost is handled by the base getCastInstrCost
1480 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1481
1482 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1483 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1484 switch (ISD) {
1485 case ISD::SIGN_EXTEND:
1486 case ISD::ZERO_EXTEND: {
1487 if ((PowDiff < 1) || (PowDiff > 3))
1488 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1489 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1490 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1491 unsigned Op =
1492 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1493 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1494 }
1495 case ISD::TRUNCATE:
1496 case ISD::FP_EXTEND:
1497 case ISD::FP_ROUND: {
1498 // Counts of narrow/widen instructions.
1499 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1500 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1501
1502 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1503 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1504 : RISCV::VFNCVT_F_F_W;
1506 for (; SrcEltSize != DstEltSize;) {
1507 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1508 ? MVT::getIntegerVT(DstEltSize)
1509 : MVT::getFloatingPointVT(DstEltSize);
1510 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1511 DstEltSize =
1512 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1513 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1514 }
1515 return Cost;
1516 }
1517 case ISD::FP_TO_SINT:
1518 case ISD::FP_TO_UINT: {
1519 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1520 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1521 unsigned FWCVT =
1522 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1523 unsigned FNCVT =
1524 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1525 unsigned SrcEltSize = Src->getScalarSizeInBits();
1526 unsigned DstEltSize = Dst->getScalarSizeInBits();
1528 if ((SrcEltSize == 16) &&
1529 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1530 // If the target only supports zvfhmin or it is fp16-to-i64 conversion
1531 // pre-widening to f32 and then convert f32 to integer
1532 VectorType *VecF32Ty =
1533 VectorType::get(Type::getFloatTy(Dst->getContext()),
1534 cast<VectorType>(Dst)->getElementCount());
1535 std::pair<InstructionCost, MVT> VecF32LT =
1536 getTypeLegalizationCost(VecF32Ty);
1537 Cost +=
1538 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1539 VecF32LT.second, CostKind);
1540 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1541 return Cost;
1542 }
1543 if (DstEltSize == SrcEltSize)
1544 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1545 else if (DstEltSize > SrcEltSize)
1546 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1547 else { // (SrcEltSize > DstEltSize)
1548 // First do a narrowing conversion to an integer half the size, then
1549 // truncate if needed.
1550 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1551 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1552 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1553 if ((SrcEltSize / 2) > DstEltSize) {
1554 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1555 Cost +=
1556 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1557 }
1558 }
1559 return Cost;
1560 }
1561 case ISD::SINT_TO_FP:
1562 case ISD::UINT_TO_FP: {
1563 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1564 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1565 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1566 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1567 unsigned SrcEltSize = Src->getScalarSizeInBits();
1568 unsigned DstEltSize = Dst->getScalarSizeInBits();
1569
1571 if ((DstEltSize == 16) &&
1572 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1573 // If the target only supports zvfhmin or it is i64-to-fp16 conversion
1574 // it is converted to f32 and then converted to f16
1575 VectorType *VecF32Ty =
1576 VectorType::get(Type::getFloatTy(Dst->getContext()),
1577 cast<VectorType>(Dst)->getElementCount());
1578 std::pair<InstructionCost, MVT> VecF32LT =
1579 getTypeLegalizationCost(VecF32Ty);
1580 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1581 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1582 DstLT.second, CostKind);
1583 return Cost;
1584 }
1585
1586 if (DstEltSize == SrcEltSize)
1587 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1588 else if (DstEltSize > SrcEltSize) {
1589 if ((DstEltSize / 2) > SrcEltSize) {
1590 VectorType *VecTy =
1591 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1592 cast<VectorType>(Dst)->getElementCount());
1593 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1594 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1595 }
1596 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1597 } else
1598 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1599 return Cost;
1600 }
1601 }
1602 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1603}
1604
1605unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1606 if (isa<ScalableVectorType>(Ty)) {
1607 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1608 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1609 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1610 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1611 }
1612 return cast<FixedVectorType>(Ty)->getNumElements();
1613}
1614
1617 FastMathFlags FMF,
1619 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1620 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1621
1622 // Skip if scalar size of Ty is bigger than ELEN.
1623 if (Ty->getScalarSizeInBits() > ST->getELen())
1624 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1625
1626 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1627 if (Ty->getElementType()->isIntegerTy(1)) {
1628 // SelectionDAGBuilder does following transforms:
1629 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1630 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1631 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1632 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1633 else
1634 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1635 }
1636
1637 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1639 InstructionCost ExtraCost = 0;
1640 switch (IID) {
1641 case Intrinsic::maximum:
1642 if (FMF.noNaNs()) {
1643 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1644 } else {
1645 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1646 RISCV::VFMV_F_S};
1647 // Cost of Canonical Nan + branch
1648 // lui a0, 523264
1649 // fmv.w.x fa0, a0
1650 Type *DstTy = Ty->getScalarType();
1651 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1652 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1653 ExtraCost = 1 +
1654 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1656 getCFInstrCost(Instruction::Br, CostKind);
1657 }
1658 break;
1659
1660 case Intrinsic::minimum:
1661 if (FMF.noNaNs()) {
1662 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1663 } else {
1664 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1665 RISCV::VFMV_F_S};
1666 // Cost of Canonical Nan + branch
1667 // lui a0, 523264
1668 // fmv.w.x fa0, a0
1669 Type *DstTy = Ty->getScalarType();
1670 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1671 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1672 ExtraCost = 1 +
1673 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1675 getCFInstrCost(Instruction::Br, CostKind);
1676 }
1677 break;
1678 }
1679 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1680 }
1681
1682 // IR Reduction is composed by one rvv reduction instruction and vmv
1683 unsigned SplitOp;
1685 switch (IID) {
1686 default:
1687 llvm_unreachable("Unsupported intrinsic");
1688 case Intrinsic::smax:
1689 SplitOp = RISCV::VMAX_VV;
1690 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1691 break;
1692 case Intrinsic::smin:
1693 SplitOp = RISCV::VMIN_VV;
1694 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1695 break;
1696 case Intrinsic::umax:
1697 SplitOp = RISCV::VMAXU_VV;
1698 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1699 break;
1700 case Intrinsic::umin:
1701 SplitOp = RISCV::VMINU_VV;
1702 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1703 break;
1704 case Intrinsic::maxnum:
1705 SplitOp = RISCV::VFMAX_VV;
1706 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1707 break;
1708 case Intrinsic::minnum:
1709 SplitOp = RISCV::VFMIN_VV;
1710 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1711 break;
1712 }
1713 // Add a cost for data larger than LMUL8
1714 InstructionCost SplitCost =
1715 (LT.first > 1) ? (LT.first - 1) *
1716 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1717 : 0;
1718 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1719}
1720
1723 std::optional<FastMathFlags> FMF,
1725 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1726 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1727
1728 // Skip if scalar size of Ty is bigger than ELEN.
1729 if (Ty->getScalarSizeInBits() > ST->getELen())
1730 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1731
1732 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1733 assert(ISD && "Invalid opcode");
1734
1735 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1736 ISD != ISD::FADD)
1737 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1738
1739 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1740 Type *ElementTy = Ty->getElementType();
1741 if (ElementTy->isIntegerTy(1)) {
1742 // Example sequences:
1743 // vfirst.m a0, v0
1744 // seqz a0, a0
1745 if (LT.second == MVT::v1i1)
1746 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
1747 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1749
1750 if (ISD == ISD::AND) {
1751 // Example sequences:
1752 // vmand.mm v8, v9, v8 ; needed every time type is split
1753 // vmnot.m v8, v0 ; alias for vmnand
1754 // vcpop.m a0, v8
1755 // seqz a0, a0
1756
1757 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
1758 // For LMUL <= 8, there is no splitting,
1759 // the sequences are vmnot, vcpop and seqz.
1760 // When LMUL > 8 and split = 1,
1761 // the sequences are vmnand, vcpop and seqz.
1762 // When LMUL > 8 and split > 1,
1763 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
1764 return ((LT.first > 2) ? (LT.first - 2) : 0) *
1765 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
1766 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
1767 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1768 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1770 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
1771 // Example sequences:
1772 // vsetvli a0, zero, e8, mf8, ta, ma
1773 // vmxor.mm v8, v0, v8 ; needed every time type is split
1774 // vcpop.m a0, v8
1775 // andi a0, a0, 1
1776 return (LT.first - 1) *
1777 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1778 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
1779 } else {
1780 assert(ISD == ISD::OR);
1781 // Example sequences:
1782 // vsetvli a0, zero, e8, mf8, ta, ma
1783 // vmor.mm v8, v9, v8 ; needed every time type is split
1784 // vcpop.m a0, v0
1785 // snez a0, a0
1786 return (LT.first - 1) *
1787 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
1788 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1789 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1791 }
1792 }
1793
1794 // IR Reduction of or/and is composed by one vmv and one rvv reduction
1795 // instruction, and others is composed by two vmv and one rvv reduction
1796 // instruction
1797 unsigned SplitOp;
1799 switch (ISD) {
1800 case ISD::ADD:
1801 SplitOp = RISCV::VADD_VV;
1802 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1803 break;
1804 case ISD::OR:
1805 SplitOp = RISCV::VOR_VV;
1806 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
1807 break;
1808 case ISD::XOR:
1809 SplitOp = RISCV::VXOR_VV;
1810 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1811 break;
1812 case ISD::AND:
1813 SplitOp = RISCV::VAND_VV;
1814 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
1815 break;
1816 case ISD::FADD:
1817 // We can't promote f16/bf16 fadd reductions.
1818 if ((LT.second.getVectorElementType() == MVT::f16 &&
1819 !ST->hasVInstructionsF16()) ||
1820 LT.second.getVectorElementType() == MVT::bf16)
1821 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1823 Opcodes.push_back(RISCV::VFMV_S_F);
1824 for (unsigned i = 0; i < LT.first.getValue(); i++)
1825 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1826 Opcodes.push_back(RISCV::VFMV_F_S);
1827 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1828 }
1829 SplitOp = RISCV::VFADD_VV;
1830 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1831 break;
1832 }
1833 // Add a cost for data larger than LMUL8
1834 InstructionCost SplitCost =
1835 (LT.first > 1) ? (LT.first - 1) *
1836 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1837 : 0;
1838 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1839}
1840
// RISCVTTIImpl::getExtendedReductionCost
// NOTE(review): the doxygen dump dropped the line carrying the function
// name/return type (orig. line 1841) and line 1843 (presumably the
// FastMathFlags/CostKind parameters); the integers starting each code line
// are doxygen line-number artifacts, not code.
// Costs vector_reduce_*(ext(<ValTy>)) widened into ResTy, deferring to the
// base implementation whenever RVV cannot handle the pattern directly.
1842 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1844 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1845 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1846 FMF, CostKind);
1847
1848 // Skip if scalar size of ResTy is bigger than ELEN.
1849 if (ResTy->getScalarSizeInBits() > ST->getELen())
1850 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1851 FMF, CostKind);
1852
// Only integer add and fp add extended reductions are modeled specially.
1853 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1854 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1855 FMF, CostKind);
1856
1857 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1858
1859 if (IsUnsigned && Opcode == Instruction::Add &&
1860 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
1861 // Represent vector_reduce_add(ZExt(<n x i1>)) as
1862 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
1863 return LT.first *
1864 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
1865 }
1866
// The fused widening-reduction lowering only applies when the result is
// exactly twice the element width; otherwise fall back.
1867 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1868 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1869 FMF, CostKind);
1870
// (LT.first - 1) accounts for combining split register groups before the
// single reduction; the reduction itself is costed as the unextended form.
1871 return (LT.first - 1) +
1872 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1873}
1874
// RISCVTTIImpl::getStoreImmCost
// NOTE(review): the first signature line (orig. 1875, with the function name
// and the Type *Ty parameter) was dropped by the doxygen extraction; the
// leading integers are line-number artifacts.
// Extra cost of materializing a constant operand for a store.
1876 TTI::OperandValueInfo OpInfo,
1878 assert(OpInfo.isConstant() && "non constant operand?");
1879 if (!isa<VectorType>(Ty))
1880 // FIXME: We need to account for immediate materialization here, but doing
1881 // a decent job requires more knowledge about the immediate than we
1882 // currently have here.
1883 return 0;
1884
1885 if (OpInfo.isUniform())
1886 // vmv.v.i, vmv.v.x, or vfmv.v.f
1887 // We ignore the cost of the scalar constant materialization to be consistent
1888 // with how we treat scalar constants themselves just above.
1889 return 1;
1890
// Non-uniform vector constants are assumed to be loaded from a constant pool.
1891 return getConstantPoolLoadCost(Ty, CostKind);
1892}
1893
1894
// RISCVTTIImpl::getMemoryOpCost
// NOTE(review): the doxygen dump dropped the signature line with the function
// name (orig. 1895), line 1898 (presumably the CostKind parameter), line 1907
// (presumably `InstructionCost Cost = 0;` -- `Cost` is used below without a
// visible declaration), line 1915 (presumably a CostKind guard before the
// early `return Cost;`), and line 1924 (part of the TypeSize comparison).
1896 MaybeAlign Alignment,
1897 unsigned AddressSpace,
1899 TTI::OperandValueInfo OpInfo,
1900 const Instruction *I) {
1901 EVT VT = TLI->getValueType(DL, Src, true);
1902 // Type legalization can't handle structs
1903 if (VT == MVT::Other)
1904 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1905 CostKind, OpInfo, I);
1906
// Stores of constants pay an additional materialization cost.
1908 if (Opcode == Instruction::Store && OpInfo.isConstant())
1909 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1910
1911 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1912
// Immediately-invoked lambda computes the base (per-op) cost.
1913 InstructionCost BaseCost = [&]() {
1914 InstructionCost Cost = LT.first;
1916 return Cost;
1917
1918 // Our actual lowering for the case where a wider legal type is available
1919 // uses the a VL predicated load on the wider type. This is reflected in
1920 // the result of getTypeLegalizationCost, but BasicTTI assumes the
1921 // widened cases are scalarized.
1922 const DataLayout &DL = this->getDataLayout();
1923 if (Src->isVectorTy() && LT.second.isVector() &&
1925 LT.second.getSizeInBits()))
1926 return Cost;
1927
1928 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1929 CostKind, OpInfo, I);
1930 }();
1931
1932 // Assume memory ops cost scale with the number of vector registers
1933 // possible accessed by the instruction. Note that BasicTTI already
1934 // handles the LT.first term for us.
1935 if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1936 BaseCost *= TLI->getLMULCost(LT.second);
1937 return Cost + BaseCost;
1938
1939}
1940
// RISCVTTIImpl::getCmpSelInstrCost
// NOTE(review): dropped by the doxygen extraction: orig. line 1941 (function
// name / return type), 1943 (presumably CostKind and Op1Info parameters) and
// 1945 (presumably a `CostKind != TCK_RecipThroughput` guard preceding the
// first fallback return). Leading integers are line-number artifacts.
// Costs vector/scalar compare and select, modeling RVV mask-register
// sequences explicitly.
1942 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1944 TTI::OperandValueInfo Op2Info, const Instruction *I) {
1946 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1947 Op1Info, Op2Info, I);
1948
1949 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1950 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1951 Op1Info, Op2Info, I);
1952
1953 // Skip if scalar size of ValTy is bigger than ELEN.
1954 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1955 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1956 Op1Info, Op2Info, I);
1957
// Cost of materializing a constant operand: free if it can sit in a scalar
// register (uniform), otherwise a constant-pool load.
1958 auto GetConstantMatCost =
1959 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
1960 if (OpInfo.isUniform())
1961 // We return 0 we currently ignore the cost of materializing scalar
1962 // constants in GPRs.
1963 return 0;
1964
1965 return getConstantPoolLoadCost(ValTy, CostKind);
1966 };
1967
1968 InstructionCost ConstantMatCost;
1969 if (Op1Info.isConstant())
1970 ConstantMatCost += GetConstantMatCost(Op1Info);
1971 if (Op2Info.isConstant())
1972 ConstantMatCost += GetConstantMatCost(Op2Info);
1973
1974 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1975 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1976 if (CondTy->isVectorTy()) {
1977 if (ValTy->getScalarSizeInBits() == 1) {
1978 // vmandn.mm v8, v8, v9
1979 // vmand.mm v9, v0, v9
1980 // vmor.mm v0, v9, v8
1981 return ConstantMatCost +
1982 LT.first *
1983 getRISCVInstructionCost(
1984 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1985 LT.second, CostKind);
1986 }
1987 // vselect and max/min are supported natively.
1988 return ConstantMatCost +
1989 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
1990 CostKind);
1991 }
1992
// Scalar condition: splat it to a vector mask first.
1993 if (ValTy->getScalarSizeInBits() == 1) {
1994 // vmv.v.x v9, a0
1995 // vmsne.vi v9, v9, 0
1996 // vmandn.mm v8, v8, v9
1997 // vmand.mm v9, v0, v9
1998 // vmor.mm v0, v9, v8
1999 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2000 return ConstantMatCost +
2001 LT.first *
2002 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2003 InterimVT, CostKind) +
2004 LT.first * getRISCVInstructionCost(
2005 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2006 LT.second, CostKind);
2007 }
2008
2009 // vmv.v.x v10, a0
2010 // vmsne.vi v0, v10, 0
2011 // vmerge.vvm v8, v9, v8, v0
2012 return ConstantMatCost +
2013 LT.first * getRISCVInstructionCost(
2014 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2015 LT.second, CostKind);
2016 }
2017
2018 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2019 CmpInst::isIntPredicate(VecPred)) {
2020 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2021 // provided they incur the same cost across all implementations
2022 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2023 LT.second,
2024 CostKind);
2025 }
2026
2027 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2028 CmpInst::isFPPredicate(VecPred)) {
2029
2030 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2031 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2032 return ConstantMatCost +
2033 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2034
2035 // If we do not support the input floating point vector type, use the base
2036 // one which will calculate as:
2037 // ScalarizeCost + Num * Cost for fixed vector,
2038 // InvalidCost for scalable vector.
2039 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2040 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2041 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2042 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2043 Op1Info, Op2Info, I);
2044
2045 // Assuming vector fp compare and mask instructions are all the same cost
2046 // until a need arises to differentiate them.
2047 switch (VecPred) {
2048 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2049 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2050 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2051 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2052 return ConstantMatCost +
2053 LT.first * getRISCVInstructionCost(
2054 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2055 LT.second, CostKind);
2056
2057 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2058 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2059 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2060 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2061 return ConstantMatCost +
2062 LT.first *
2063 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2064 LT.second, CostKind);
2065
2066 case CmpInst::FCMP_OEQ: // vmfeq.vv
2067 case CmpInst::FCMP_OGT: // vmflt.vv
2068 case CmpInst::FCMP_OGE: // vmfle.vv
2069 case CmpInst::FCMP_OLT: // vmflt.vv
2070 case CmpInst::FCMP_OLE: // vmfle.vv
2071 case CmpInst::FCMP_UNE: // vmfne.vv
2072 return ConstantMatCost +
2073 LT.first *
2074 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2075 default:
2076 break;
2077 }
2078 }
2079
2080 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2081 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2082 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2083 // be (0 + select instr cost).
2084 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2085 ValTy->isIntegerTy() && !I->user_empty()) {
2086 if (all_of(I->users(), [&](const User *U) {
2087 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2088 U->getType()->isIntegerTy() &&
2089 !isa<ConstantData>(U->getOperand(1)) &&
2090 !isa<ConstantData>(U->getOperand(2));
2091 }))
2092 return 0;
2093 }
2094
2095 // TODO: Add cost for scalar type.
2096
2097 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2098 Op1Info, Op2Info, I);
2099}
2100
// RISCVTTIImpl::getCFInstrCost
// NOTE(review): orig. lines 2101-2102 (function name / Opcode + CostKind
// parameters) and 2104 (presumably a CostKind check guarding the first
// return) were dropped by the doxygen extraction.
// Cost of control-flow instructions: PHIs are free, branches are assumed
// predicted (cost 0) for throughput.
2103 const Instruction *I) {
2105 return Opcode == Instruction::PHI ? 0 : 1;
2106 // Branches are assumed to be predicted.
2107 return 0;
2108}
2109
// RISCVTTIImpl::getVectorInstrCost
// NOTE(review): dropped by the doxygen extraction: orig. lines 2110-2111
// (function name, Opcode/Val/CostKind parameters), 2152 (part of the WideTy
// construction), and 2157/2164/2169 (presumably the CastContextHint/CostKind
// arguments of the getCastInstrCost calls). Leading integers are artifacts.
// Costs extractelement/insertelement, modeling RVV slide + vmv sequences.
2112 unsigned Index, Value *Op0,
2113 Value *Op1) {
2114 assert(Val->isVectorTy() && "This must be a vector type");
2115
2116 if (Opcode != Instruction::ExtractElement &&
2117 Opcode != Instruction::InsertElement)
2118 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2119
2120 // Legalize the type.
2121 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2122
2123 // This type is legalized to a scalar type.
2124 if (!LT.second.isVector()) {
2125 auto *FixedVecTy = cast<FixedVectorType>(Val);
2126 // If Index is a known constant, cost is zero.
2127 if (Index != -1U)
2128 return 0;
2129 // Extract/InsertElement with non-constant index is very costly when
2130 // scalarized; estimate cost of loads/stores sequence via the stack:
2131 // ExtractElement cost: store vector to stack, load scalar;
2132 // InsertElement cost: store vector to stack, store scalar, load vector.
2133 Type *ElemTy = FixedVecTy->getElementType();
2134 auto NumElems = FixedVecTy->getNumElements();
2135 auto Align = DL.getPrefTypeAlign(ElemTy);
2136 InstructionCost LoadCost =
2137 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2138 InstructionCost StoreCost =
2139 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2140 return Opcode == Instruction::ExtractElement
2141 ? StoreCost * NumElems + LoadCost
2142 : (StoreCost + LoadCost) * NumElems + StoreCost;
2143 }
2144
2145 // For unsupported scalable vector.
2146 if (LT.second.isScalableVector() && !LT.first.isValid())
2147 return LT.first;
2148
2149 // Mask vector extract/insert is expanded via e8.
2150 if (Val->getScalarSizeInBits() == 1) {
2151 VectorType *WideTy =
2153 cast<VectorType>(Val)->getElementCount());
2154 if (Opcode == Instruction::ExtractElement) {
2155 InstructionCost ExtendCost
2156 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2158 InstructionCost ExtractCost
2159 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2160 return ExtendCost + ExtractCost;
2161 }
2162 InstructionCost ExtendCost
2163 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2165 InstructionCost InsertCost
2166 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2167 InstructionCost TruncCost
2168 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2170 return ExtendCost + InsertCost + TruncCost;
2171 }
2172
2173
2174 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2175 // and vslideup + vmv.s.x to insert element to vector.
2176 unsigned BaseCost = 1;
2177 // When insertelement we should add the index with 1 as the input of vslideup.
2178 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2179
2180 if (Index != -1U) {
2181 // The type may be split. For fixed-width vectors we can normalize the
2182 // index to the new type.
2183 if (LT.second.isFixedLengthVector()) {
2184 unsigned Width = LT.second.getVectorNumElements();
2185 Index = Index % Width;
2186 }
2187
2188 // If exact VLEN is known, we will insert/extract into the appropriate
2189 // subvector with no additional subvector insert/extract cost.
2190 if (auto VLEN = ST->getRealVLen()) {
2191 unsigned EltSize = LT.second.getScalarSizeInBits();
2192 unsigned M1Max = *VLEN / EltSize;
2193 Index = Index % M1Max;
2194 }
2195
2196 // We could extract/insert the first element without vslidedown/vslideup.
2197 if (Index == 0)
2198 SlideCost = 0;
2199 else if (Opcode == Instruction::InsertElement)
2200 SlideCost = 1; // With a constant index, we do not need to use addi.
2201 }
2202
2203 // When the vector needs to split into multiple register groups and the index
2204 // exceeds single vector register group, we need to insert/extract the element
2205 // via stack.
2206 if (LT.first > 1 &&
2207 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2208 LT.second.isScalableVector()))) {
2209 Type *ScalarType = Val->getScalarType();
2210 Align VecAlign = DL.getPrefTypeAlign(Val);
2211 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2212 // Extra addi for unknown index.
2213 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2214
2215 // Store all split vectors into stack and load the target element.
2216 if (Opcode == Instruction::ExtractElement)
2217 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2218 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2219 CostKind) +
2220 IdxCost;
2221
2222 // Store all split vectors into stack and store the target element and load
2223 // vectors back.
2224 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2225 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2226 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2227 CostKind) +
2228 IdxCost;
2229 }
2230
2231 // Extract i64 in the target that has XLEN=32 need more instruction.
2232 if (Val->getScalarType()->isIntegerTy() &&
2233 ST->getXLen() < Val->getScalarSizeInBits()) {
2234 // For extractelement, we need the following instructions:
2235 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2236 // vslidedown.vx v8, v8, a0
2237 // vmv.x.s a0, v8
2238 // li a1, 32
2239 // vsrl.vx v8, v8, a1
2240 // vmv.x.s a1, v8
2241
2242 // For insertelement, we need the following instructions:
2243 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2244 // vmv.v.i v12, 0
2245 // vslide1up.vx v16, v12, a1
2246 // vslide1up.vx v12, v16, a0
2247 // addi a0, a2, 1
2248 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2249 // vslideup.vx v8, v12, a2
2250
2251 // TODO: should we count these special vsetvlis?
2252 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2253 }
2254 return BaseCost + SlideCost;
2255}
2256
// RISCVTTIImpl::getArithmeticInstrCost
// NOTE(review): dropped by the doxygen extraction: orig. lines 2257 (return
// type / function name), 2259 (presumably the Op1Info/Op2Info parameters),
// 2263 (presumably a CostKind guard), 2291 (the TargetLowering legalize-action
// comparand), and 2298/2302 (CastContextHint/CostKind arguments). The leading
// integers are line-number artifacts.
// Costs vector arithmetic by mapping each ISD opcode to a representative RVV
// instruction and scaling by LMUL and split count.
2258 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2260 ArrayRef<const Value *> Args, const Instruction *CxtI) {
2261
2262 // TODO: Handle more cost kinds.
2264 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2265 Args, CxtI);
2266
2267 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2268 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2269 Args, CxtI);
2270
2271 // Skip if scalar size of Ty is bigger than ELEN.
2272 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2273 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2274 Args, CxtI);
2275
2276 // Legalize the type.
2277 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2278
2279 // TODO: Handle scalar type.
2280 if (!LT.second.isVector())
2281 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2282 Args, CxtI);
2283
2284 // f16 with zvfhmin and bf16 will be promoted to f32.
2285 // FIXME: nxv32[b]f16 will be custom lowered and split.
2286 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2287 InstructionCost CastCost = 0;
2288 if ((LT.second.getVectorElementType() == MVT::f16 ||
2289 LT.second.getVectorElementType() == MVT::bf16) &&
2290 TLI->getOperationAction(ISDOpcode, LT.second) ==
2292 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2293 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2294 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2295 // Add cost of extending arguments
2296 CastCost += LT.first * Args.size() *
2297 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2299 // Add cost of truncating result
2300 CastCost +=
2301 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2303 // Compute cost of op in promoted type
2304 LT.second = PromotedVT;
2305 }
2306
2307 auto getConstantMatCost =
2308 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2309 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2310 // Two sub-cases:
2311 // * Has a 5 bit immediate operand which can be splatted.
2312 // * Has a larger immediate which must be materialized in scalar register
2313 // We return 0 for both as we currently ignore the cost of materializing
2314 // scalar constants in GPRs.
2315 return 0;
2316
2317 return getConstantPoolLoadCost(Ty, CostKind);
2318 };
2319
2320 // Add the cost of materializing any constant vectors required.
2321 InstructionCost ConstantMatCost = 0;
2322 if (Op1Info.isConstant())
2323 ConstantMatCost += getConstantMatCost(0, Op1Info);
2324 if (Op2Info.isConstant())
2325 ConstantMatCost += getConstantMatCost(1, Op2Info);
2326
// Map the ISD opcode onto one representative RVV instruction; ops in the
// same case group are assumed to cost the same.
2327 unsigned Op;
2328 switch (ISDOpcode) {
2329 case ISD::ADD:
2330 case ISD::SUB:
2331 Op = RISCV::VADD_VV;
2332 break;
2333 case ISD::SHL:
2334 case ISD::SRL:
2335 case ISD::SRA:
2336 Op = RISCV::VSLL_VV;
2337 break;
2338 case ISD::AND:
2339 case ISD::OR:
2340 case ISD::XOR:
2341 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2342 break;
2343 case ISD::MUL:
2344 case ISD::MULHS:
2345 case ISD::MULHU:
2346 Op = RISCV::VMUL_VV;
2347 break;
2348 case ISD::SDIV:
2349 case ISD::UDIV:
2350 Op = RISCV::VDIV_VV;
2351 break;
2352 case ISD::SREM:
2353 case ISD::UREM:
2354 Op = RISCV::VREM_VV;
2355 break;
2356 case ISD::FADD:
2357 case ISD::FSUB:
2358 Op = RISCV::VFADD_VV;
2359 break;
2360 case ISD::FMUL:
2361 Op = RISCV::VFMUL_VV;
2362 break;
2363 case ISD::FDIV:
2364 Op = RISCV::VFDIV_VV;
2365 break;
2366 case ISD::FNEG:
2367 Op = RISCV::VFSGNJN_VV;
2368 break;
2369 default:
2370 // Assuming all other instructions have the same cost until a need arises to
2371 // differentiate them.
2372 return CastCost + ConstantMatCost +
2373 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2374 Args, CxtI);
2375 }
2376
2377 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2378 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2379 // ops are twice as expensive as integer ops. Do the same for vectors so
2380 // scalar floating point ops aren't cheaper than their vector equivalents.
2381 if (Ty->isFPOrFPVectorTy())
2382 InstrCost *= 2;
2383 return CastCost + ConstantMatCost + LT.first * InstrCost;
2384}
2385
2386// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
// RISCVTTIImpl::getPointersChainCost
// NOTE(review): dropped by the doxygen extraction: orig. line 2387 (return
// type / function name), and 2390-2391 (presumably the CostKind parameter and
// the `InstructionCost Cost = 0;` accumulator -- `Cost` is used below without
// a visible declaration).
2388 ArrayRef<const Value *> Ptrs, const Value *Base,
2389 const TTI::PointersChainInfo &Info, Type *AccessTy,
2392 // In the basic model we take into account GEP instructions only
2393 // (although here can come alloca instruction, a value, constants and/or
2394 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2395 // pointer). Typically, if Base is a not a GEP-instruction and all the
2396 // pointers are relative to the same base address, all the rest are
2397 // either GEP instructions, PHIs, bitcasts or constants. When we have same
2398 // base, we just calculate cost of each non-Base GEP as an ADD operation if
2399 // any their index is a non-const.
2400 // If no known dependencies between the pointers cost is calculated as a sum
2401 // of costs of GEP instructions.
2402 for (auto [I, V] : enumerate(Ptrs)) {
2403 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2404 if (!GEP)
2405 continue;
2406 if (Info.isSameBase() && V != Base) {
2407 if (GEP->hasAllConstantIndices())
2408 continue;
2409 // If the chain is unit-stride and BaseReg + stride*i is a legal
2410 // addressing mode, then presume the base GEP is sitting around in a
2411 // register somewhere and check if we can fold the offset relative to
2412 // it.
2413 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2414 if (Info.isUnitStride() &&
2415 isLegalAddressingMode(AccessTy,
2416 /* BaseGV */ nullptr,
2417 /* BaseOffset */ Stride * I,
2418 /* HasBaseReg */ true,
2419 /* Scale */ 0,
2420 GEP->getType()->getPointerAddressSpace()))
2421 continue;
2422 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2423 {TTI::OK_AnyValue, TTI::OP_None},
2424 {TTI::OK_AnyValue, TTI::OP_None}, {});
2425 } else {
2426 SmallVector<const Value *> Indices(GEP->indices());
2427 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2428 Indices, AccessTy, CostKind);
2429 }
2430 }
2431 return Cost;
2432}
2433
// RISCVTTIImpl::getUnrollingPreferences
// NOTE(review): dropped by the doxygen extraction: orig. lines 2434-2436
// (function name and the L/SE/UP/ORE parameter list), 2450 (presumably
// `UP.PartialOptSizeThreshold = 0;`), 2476 (presumably the
// `InstructionCost Cost = 0;` accumulator used below), and 2493-2494 (the
// per-instruction cost accumulation inside the loop body).
// Tunes runtime/partial unrolling for in-order cores when the subtarget
// opts out of the default unroll heuristics.
2437 // TODO: More tuning on benchmarks and metrics with changes as needed
2438 // would apply to all settings below to enable performance.
2439
2440
2441 if (ST->enableDefaultUnroll())
2442 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2443
2444 // Enable Upper bound unrolling universally, not dependent upon the conditions
2445 // below.
2446 UP.UpperBound = true;
2447
2448 // Disable loop unrolling for Oz and Os.
2449 UP.OptSizeThreshold = 0;
2451 if (L->getHeader()->getParent()->hasOptSize())
2452 return;
2453
2454 SmallVector<BasicBlock *, 4> ExitingBlocks;
2455 L->getExitingBlocks(ExitingBlocks);
2456 LLVM_DEBUG(dbgs() << "Loop has:\n"
2457 << "Blocks: " << L->getNumBlocks() << "\n"
2458 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2459
2460 // Only allow another exit other than the latch. This acts as an early exit
2461 // as it mirrors the profitability calculation of the runtime unroller.
2462 if (ExitingBlocks.size() > 2)
2463 return;
2464
2465 // Limit the CFG of the loop body for targets with a branch predictor.
2466 // Allowing 4 blocks permits if-then-else diamonds in the body.
2467 if (L->getNumBlocks() > 4)
2468 return;
2469
2470 // Don't unroll vectorized loops, including the remainder loop
2471 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2472 return;
2473
2474 // Scan the loop: don't unroll loops with calls as this could prevent
2475 // inlining.
2477 for (auto *BB : L->getBlocks()) {
2478 for (auto &I : *BB) {
2479 // Initial setting - Don't unroll loops containing vectorized
2480 // instructions.
2481 if (I.getType()->isVectorTy())
2482 return;
2483
2484 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2485 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2486 if (!isLoweredToCall(F))
2487 continue;
2488 }
2489 return;
2490 }
2491
2492 SmallVector<const Value *> Operands(I.operand_values());
2495 }
2496 }
2497
2498 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2499
2500 UP.Partial = true;
2501 UP.Runtime = true;
2502 UP.UnrollRemainder = true;
2503 UP.UnrollAndJam = true;
2504
2505 // Force unrolling small loops can be very useful because of the branch
2506 // taken cost of the backedge.
2507 if (Cost < 12)
2508 UP.Force = true;
2509}
2510
2514}
2515
// RISCVTTIImpl::getRegUsageForType
// NOTE(review): dropped by the doxygen extraction: orig. line 2516 (function
// name / `Type *Ty` parameter), 2521-2522 (part of the promoted-f32 recursive
// call), 2525 (presumably `TypeSize Size = DL.getTypeSizeInBits(Ty);`), and
// 2529 (presumably the fixed-size guard before the final divideCeil).
// Estimates how many vector registers a value of type Ty occupies.
2517 if (Ty->isVectorTy()) {
2518 // f16 with only zvfhmin and bf16 will be promoted to f32
2519 Type *EltTy = cast<VectorType>(Ty)->getElementType();
2520 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
2521 EltTy->isBFloatTy())
2523 cast<VectorType>(Ty));
2524
2526 if (Size.isScalable() && ST->hasVInstructions())
2527 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
2528
2530 return divideCeil(Size, ST->getRealMinVLen());
2531 }
2532
2533 return BaseT::getRegUsageForType(Ty);
2534}
2535
// NOTE(review): orig. line 2546 (the initializer expression for RegWidth,
// presumably a getRegisterBitWidth(...) call) was dropped by the doxygen
// extraction; the leading integers are line-number artifacts.
2536unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
// The -riscv-v-slp-max-vf flag overrides the computed answer when set.
2537 if (SLPMaxVF.getNumOccurrences())
2538 return SLPMaxVF;
2539
2540 // Return how many elements can fit in getRegisterBitwidth. This is the
2541 // same routine as used in LoopVectorizer. We should probably be
2542 // accounting for whether we actually have instructions with the right
2543 // lane type, but we don't have enough information to do that without
2544 // some additional plumbing which hasn't been justified yet.
2545 TypeSize RegWidth =
2547 // If no vector registers, or absurd element widths, disable
2548 // vectorization by returning 1.
2549 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
2550}
2551
// RISCVTTIImpl::getPreferredAddressingMode (presumably -- the name line,
// orig. 2552-2553, was dropped by the doxygen extraction, as was line 2558,
// presumably the default `return TTI::AMK_None;`-style fallback).
// Prefers post-indexed addressing on RV32 cores with the XCVmem extension.
2554 ScalarEvolution *SE) const {
2555 if (ST->hasVendorXCVmem() && !ST->is64Bit())
2556 return TTI::AMK_PostIndexed;
2557
2559}
2560
// RISCVTTIImpl::isLSRCostLess
// NOTE(review): the first signature line (orig. 2561, with the function name
// and the C1 parameter) was dropped by the doxygen extraction.
// Orders LSR candidate solutions: instruction count first, then register
// pressure (with base-adds counted as an extra temporary register).
2562 const TargetTransformInfo::LSRCost &C2) {
2563 // RISC-V specific here are "instruction number 1st priority".
2564 // If we need to emit adds inside the loop to add up base registers, then
2565 // we need at least one extra temporary register.
2566 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
2567 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
2568 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
2569 C1.NumIVMuls, C1.NumBaseAdds,
2570 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2571 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
2572 C2.NumIVMuls, C2.NumBaseAdds,
2573 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2574}
2575
// NOTE(review): the signature line (orig. 2576) was dropped by the doxygen
// extraction -- judging by the i8/256-element FIXME this is likely
// RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment);
// confirm against the upstream source.
// Only fixed-length vectors with legal masked load/store support qualify.
2577 auto *VTy = dyn_cast<VectorType>(DataTy);
2578 if (!VTy || VTy->isScalableTy())
2579 return false;
2580
2581 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2582 return false;
2583
2584 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
2585 // scalarize these types with LMUL >= maximum fixed-length LMUL.
2586 if (VTy->getElementType()->isIntegerTy(8))
2587 if (VTy->getElementCount().getFixedValue() > 256)
2588 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
2590 return true;
2592
// NOTE(review): the signature line (orig. 2593) was dropped by the doxygen
// extraction -- likely RISCVTTIImpl::isLegalMaskedCompressStore(Type *,
// Align); confirm against the upstream source.
// Legal for fixed-length vectors whose plain masked load/store is legal.
2594 auto *VTy = dyn_cast<VectorType>(DataTy);
2595 if (!VTy || VTy->isScalableTy())
2596 return false;
2597
2598 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2599 return false;
2600 return true;
2602
2603/// See if \p I should be considered for address type promotion. We check if \p
2604/// I is a sext with right type and used in memory accesses. If it used in a
2605/// "complex" getelementptr, we allow it to be promoted without finding other
2606/// sext instructions that sign extended the same initial value. A getelementptr
2607/// is considered as "complex" if it has more than 2 operands.
// NOTE(review): the first signature line (orig. 2608, with the function name)
// was dropped by the doxygen extraction.
2609 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2610 bool Considerable = false;
2611 AllowPromotionWithoutCommonHeader = false;
// Only sext-to-i64 instructions are candidates.
2612 if (!isa<SExtInst>(&I))
2613 return false;
2614 Type *ConsideredSExtType =
2615 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2616 if (I.getType() != ConsideredSExtType)
2617 return false;
2618 // See if the sext is the one with the right type and used in at least one
2619 // GetElementPtrInst.
2620 for (const User *U : I.users()) {
2621 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2622 Considerable = true;
2623 // A getelementptr is considered as "complex" if it has more than 2
2624 // operands. We will promote a SExt used in such complex GEP as we
2625 // expect some computation to be merged if they are done on 64 bits.
2626 if (GEPInst->getNumOperands() > 2) {
2627 AllowPromotionWithoutCommonHeader = true;
2628 break;
2629 }
2630 }
2631 }
2632 return Considerable;
2633}
2634
2635bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
2636 switch (Opcode) {
2637 case Instruction::Add:
2638 case Instruction::Sub:
2639 case Instruction::Mul:
2640 case Instruction::And:
2641 case Instruction::Or:
2642 case Instruction::Xor:
2643 case Instruction::FAdd:
2644 case Instruction::FSub:
2645 case Instruction::FMul:
2646 case Instruction::FDiv:
2647 case Instruction::ICmp:
2648 case Instruction::FCmp:
2649 return true;
2650 case Instruction::Shl:
2651 case Instruction::LShr:
2652 case Instruction::AShr:
2653 case Instruction::UDiv:
2654 case Instruction::SDiv:
2655 case Instruction::URem:
2656 case Instruction::SRem:
2657 case Instruction::Select:
2658 return Operand == 1;
2659 default:
2660 return false;
2661 }
2662}
2663
// RISCVTTIImpl::canSplatOperand (Instruction overload)
// NOTE(review): the signature line (orig. 2664, presumably
// `bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {`)
// was dropped by the doxygen extraction.
// Extends the opcode-based check to intrinsics that have scalar-operand
// (.vx/.vf) forms.
2665 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2666 return false;
2667
// Plain IR opcodes are answered by the opcode-based overload.
2668 if (canSplatOperand(I->getOpcode(), Operand))
2669 return true;
2670
2671 auto *II = dyn_cast<IntrinsicInst>(I);
2672 if (!II)
2673 return false;
2674
2675 switch (II->getIntrinsicID()) {
2676 case Intrinsic::fma:
2677 case Intrinsic::vp_fma:
2678 case Intrinsic::fmuladd:
2679 case Intrinsic::vp_fmuladd:
2680 return Operand == 0 || Operand == 1;
2681 case Intrinsic::vp_shl:
2682 case Intrinsic::vp_lshr:
2683 case Intrinsic::vp_ashr:
2684 case Intrinsic::vp_udiv:
2685 case Intrinsic::vp_sdiv:
2686 case Intrinsic::vp_urem:
2687 case Intrinsic::vp_srem:
2688 case Intrinsic::ssub_sat:
2689 case Intrinsic::vp_ssub_sat:
2690 case Intrinsic::usub_sat:
2691 case Intrinsic::vp_usub_sat:
2692 case Intrinsic::vp_select:
2693 return Operand == 1;
2694 // These intrinsics are commutative.
2695 case Intrinsic::vp_add:
2696 case Intrinsic::vp_mul:
2697 case Intrinsic::vp_and:
2698 case Intrinsic::vp_or:
2699 case Intrinsic::vp_xor:
2700 case Intrinsic::vp_fadd:
2701 case Intrinsic::vp_fmul:
2702 case Intrinsic::vp_icmp:
2703 case Intrinsic::vp_fcmp:
2704 case Intrinsic::smin:
2705 case Intrinsic::vp_smin:
2706 case Intrinsic::umin:
2707 case Intrinsic::vp_umin:
2708 case Intrinsic::smax:
2709 case Intrinsic::vp_smax:
2710 case Intrinsic::umax:
2711 case Intrinsic::vp_umax:
2712 case Intrinsic::sadd_sat:
2713 case Intrinsic::vp_sadd_sat:
2714 case Intrinsic::uadd_sat:
2715 case Intrinsic::vp_uadd_sat:
2716 // These intrinsics have 'vr' versions.
2717 case Intrinsic::vp_sub:
2718 case Intrinsic::vp_fsub:
2719 case Intrinsic::vp_fdiv:
2720 return Operand == 0 || Operand == 1;
2721 default:
2722 return false;
2723 }
2724}
2725
2726/// Check if sinking \p I's operands to I's basic block is profitable, because
2727/// the operands can be folded into a target instruction, e.g.
2728/// splats of scalars can fold into vector instructions.
// NOTE(review): dropped by the doxygen extraction: orig. line 2729 (the
// function-name signature line) and line 2754 (the start of the
// splat-shuffle `match(...)` pattern whose tail appears below).
2730 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
2731 using namespace llvm::PatternMatch;
2732
2733 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2734 return false;
2735
2736 // Don't sink splat operands if the target prefers it. Some targets requires
2737 // S2V transfer buffers and we can run out of them copying the same value
2738 // repeatedly.
2739 // FIXME: It could still be worth doing if it would improve vector register
2740 // pressure and prevent a vector spill.
2741 if (!ST->sinkSplatOperands())
2742 return false;
2743
2744 for (auto OpIdx : enumerate(I->operands())) {
2745 if (!canSplatOperand(I, OpIdx.index()))
2746 continue;
2747
2748 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2749 // Make sure we are not already sinking this operand
2750 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2751 continue;
2752
2753 // We are looking for a splat that can be sunk.
2755 m_Undef(), m_ZeroMask())))
2756 continue;
2757
2758 // Don't sink i1 splats.
2759 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
2760 continue;
2761
2762 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2763 // and vector registers
2764 for (Use &U : Op->uses()) {
2765 Instruction *Insn = cast<Instruction>(U.getUser());
2766 if (!canSplatOperand(Insn, U.getOperandNo()))
2767 return false;
2768 }
2769
2770 Use *InsertEltUse = &Op->getOperandUse(0);
2771 // Sink any fpexts since they might be used in a widening fp pattern.
2772 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
2773 if (isa<FPExtInst>(InsertElt->getOperand(1)))
2774 Ops.push_back(&InsertElt->getOperandUse(1));
2775 Ops.push_back(InsertEltUse);
2776 Ops.push_back(&OpIdx.value());
2777 }
2778 return true;
2779}
2780
// NOTE(review): dropped by the doxygen extraction: orig. line 2781 (the
// return-type line, presumably `RISCVTTIImpl::TTI::MemCmpExpansionOptions`)
// and line 2783 (presumably the local `Options` declaration used below).
// Configures inline memcmp expansion: only enabled with fast unaligned
// scalar access, and (for non-bcmp) only when Zbb/Zbkb provide rev8/orc.b
// style byte manipulation.
2782RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2784 // TODO: Enable expansion when unaligned access is not supported after we fix
2785 // issues in ExpandMemcmp.
2786 if (!ST->enableUnalignedScalarMem())
2787 return Options;
2788
2789 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
2790 return Options;
2791
2792 Options.AllowOverlappingLoads = true;
2793 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2794 Options.NumLoadsPerBlock = Options.MaxNumLoads;
// RV64 can compare 8 bytes per load; RV32 tops out at 4.
2795 if (ST->is64Bit()) {
2796 Options.LoadSizes = {8, 4, 2, 1};
2797 Options.AllowedTailExpansions = {3, 5, 6};
2798 } else {
2799 Options.LoadSizes = {4, 2, 1};
2800 Options.AllowedTailExpansions = {3};
2801 }
2802 return Options;
2803}
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Size
Hexagon Common GEP
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
uint64_t IntrinsicInst * II
#define P(N)
if(PassOpts->AAPipeline)
static InstructionCost costShuffleViaVRegSplitting(RISCVTTIImpl &TTI, MVT LegalVT, std::optional< unsigned > VLen, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Try to perform better estimation of the permutation.
static bool isRepeatedConcatMask(ArrayRef< int > Mask, int &SubVectorSize)
static unsigned isM1OrSmaller(MVT VT)
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Overrides result used for getMaximumVF query which is used " "exclusively by SLP vectorizer."), cl::Hidden)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(2), cl::Hidden)
static InstructionCost getIntImmCostImpl(const DataLayout &DL, const RISCVSubtarget *ST, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, bool FreeZeroes)
static VectorType * getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, LLVMContext &C)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID)
This file defines a TargetTransformInfo::Concept conforming object specific to the RISC-V target mach...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:622
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:800
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:799
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:478
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:694
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:922
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:806
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:958
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:379
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition: InstrTypes.h:690
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition: InstrTypes.h:688
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition: InstrTypes.h:687
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition: InstrTypes.h:685
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:682
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition: InstrTypes.h:686
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition: InstrTypes.h:675
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isFPPredicate() const
Definition: InstrTypes.h:780
bool isIntPredicate() const
Definition: InstrTypes.h:781
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool noNaNs() const
Definition: FMF.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:572
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
Definition: DerivedTypes.h:607
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition: Operator.h:42
The optimization diagnostic interface.
unsigned getMaxLMULForFixedLengthVectors() const
bool hasVInstructionsF64() const
unsigned getRealMinVLen() const
bool useRVVForFixedLengthVectors() const
unsigned getXLen() const
bool hasConditionalMoveFusion() const
bool hasVInstructionsF16() const
bool hasVInstructions() const
std::optional< unsigned > getRealVLen() const
bool hasOptimizedSegmentLoadStore(unsigned NF) const
unsigned getRealMaxVLen() const
bool hasVInstructionsF32() const
unsigned getELen() const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment)
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
std::optional< unsigned > getVScaleForTuning() const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
std::optional< unsigned > getMaxVScale() const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *Src, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool shouldExpandReduction(const IntrinsicInst *II) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind)
Return the cost of materializing an immediate for a value operand of a store instruction.
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment)
bool isLegalStridedLoadStore(Type *DataType, Align Alignment)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getRegUsageForType(Type *Ty)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
MVT getContainerForFixedLengthVector(MVT VT) const
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
static RISCVII::VLMUL getLMUL(MVT VT)
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace, const DataLayout &) const
Returns whether or not generating a interleaved load/store intrinsic for this type will be legal.
The main scalar evolution driver.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
MVT getTypeToPromoteTo(unsigned Op, MVT VT) const
If the action for this operation is to promote, this method returns the ValueType to promote to.
const DataLayout & getDataLayout() const
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
static IntegerType * getInt1Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
Value * getOperand(unsigned i) const
Definition: User.h:228
static std::optional< unsigned > getFunctionalOpcodeForVP(Intrinsic::ID ID)
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:674
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:232
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:354
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1109
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:395
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:286
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:341
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:292
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:404
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
DWARFExpression::Operation Op
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
InstructionCost Cost
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2067
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).