RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
19#include <cmath>
20#include <optional>
21using namespace llvm;
22using namespace llvm::PatternMatch;
23
24#define DEBUG_TYPE "riscvtti"
25
27 "riscv-v-register-bit-width-lmul",
29 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
30 "by autovectorized code. Fractional LMULs are not supported."),
32
34 "riscv-v-slp-max-vf",
36 "Overrides result used for getMaximumVF query which is used "
37 "exclusively by SLP vectorizer."),
39
41 RVVMinTripCount("riscv-v-min-trip-count",
42 cl::desc("Set the lower bound of a trip count to decide on "
43 "vectorization while tail-folding."),
45
47RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
49 // Check if the type is valid for all CostKind
50 if (!VT.isVector())
52 size_t NumInstr = OpCodes.size();
54 return NumInstr;
55 InstructionCost LMULCost = TLI->getLMULCost(VT);
57 return LMULCost * NumInstr;
59 for (auto Op : OpCodes) {
60 switch (Op) {
61 case RISCV::VRGATHER_VI:
62 Cost += TLI->getVRGatherVICost(VT);
63 break;
64 case RISCV::VRGATHER_VV:
65 Cost += TLI->getVRGatherVVCost(VT);
66 break;
67 case RISCV::VSLIDEUP_VI:
68 case RISCV::VSLIDEDOWN_VI:
69 Cost += TLI->getVSlideVICost(VT);
70 break;
71 case RISCV::VSLIDEUP_VX:
72 case RISCV::VSLIDEDOWN_VX:
73 Cost += TLI->getVSlideVXCost(VT);
74 break;
75 case RISCV::VREDMAX_VS:
76 case RISCV::VREDMIN_VS:
77 case RISCV::VREDMAXU_VS:
78 case RISCV::VREDMINU_VS:
79 case RISCV::VREDSUM_VS:
80 case RISCV::VREDAND_VS:
81 case RISCV::VREDOR_VS:
82 case RISCV::VREDXOR_VS:
83 case RISCV::VFREDMAX_VS:
84 case RISCV::VFREDMIN_VS:
85 case RISCV::VFREDUSUM_VS: {
86 unsigned VL = VT.getVectorMinNumElements();
87 if (!VT.isFixedLengthVector())
88 VL *= *getVScaleForTuning();
89 Cost += Log2_32_Ceil(VL);
90 break;
91 }
92 case RISCV::VFREDOSUM_VS: {
93 unsigned VL = VT.getVectorMinNumElements();
94 if (!VT.isFixedLengthVector())
95 VL *= *getVScaleForTuning();
96 Cost += VL;
97 break;
98 }
99 case RISCV::VMV_X_S:
100 case RISCV::VMV_S_X:
101 case RISCV::VFMV_F_S:
102 case RISCV::VFMV_S_F:
103 case RISCV::VMOR_MM:
104 case RISCV::VMXOR_MM:
105 case RISCV::VMAND_MM:
106 case RISCV::VMANDN_MM:
107 case RISCV::VMNAND_MM:
108 case RISCV::VCPOP_M:
109 case RISCV::VFIRST_M:
110 Cost += 1;
111 break;
112 default:
113 Cost += LMULCost;
114 }
115 }
116 return Cost;
117}
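// A minimal standalone sketch (separate from this file; VScaleForTuning and
// MinElts below are assumed example values) of the reduction costs computed
// above: unordered reductions are modeled as a log2(VL)-deep reduction tree,
// while the ordered vfredosum.vs is linear in the estimated VL.
#include <cstdint>
#include <iostream>

// Ceil(log2(X)) for X >= 1, in the spirit of llvm::Log2_32_Ceil.
static unsigned log2Ceil(uint32_t X) {
  unsigned Bits = 0;
  while ((uint32_t(1) << Bits) < X)
    ++Bits;
  return Bits;
}

int main() {
  const unsigned VScaleForTuning = 2;      // assumed tuning value
  const unsigned MinElts = 8;              // e.g. <vscale x 8 x i32>
  unsigned VL = MinElts * VScaleForTuning; // estimated runtime VL
  std::cout << "tree reduction cost:    " << log2Ceil(VL) << "\n"; // 4
  std::cout << "ordered reduction cost: " << VL << "\n";           // 16
  return 0;
}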
118
120 const RISCVSubtarget *ST,
121 const APInt &Imm, Type *Ty,
123 bool FreeZeroes) {
124 assert(Ty->isIntegerTy() &&
125 "getIntImmCost can only estimate cost of materialising integers");
126
127 // We have a Zero register, so 0 is always free.
128 if (Imm == 0)
129 return TTI::TCC_Free;
130
131 // Otherwise, we check how many instructions it will take to materialise.
132 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
133 /*CompressionCost=*/false, FreeZeroes);
134}
135
139 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
140}
141
142// Look for patterns of shift followed by AND that can be turned into a pair of
143// shifts. We won't need to materialize an immediate for the AND so these can
144// be considered free.
145static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
146 uint64_t Mask = Imm.getZExtValue();
147 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
148 if (!BO || !BO->hasOneUse())
149 return false;
150
151 if (BO->getOpcode() != Instruction::Shl)
152 return false;
153
154 if (!isa<ConstantInt>(BO->getOperand(1)))
155 return false;
156
157 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
158 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
159 // is a mask shifted by c2 bits with c3 leading zeros.
160 if (isShiftedMask_64(Mask)) {
161 unsigned Trailing = llvm::countr_zero(Mask);
162 if (ShAmt == Trailing)
163 return true;
164 }
165
166 return false;
167}
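// A minimal standalone sketch (separate from this file; the constants are
// chosen purely for illustration) of the rewrite canUseShiftPair relies on:
// when c1 is a shifted mask whose trailing zero count equals the shift amount
// c2, the AND needs no materialized immediate because
// (x << c2) & c1 == (x << (c2 + c3)) >> c3, with c3 the leading zeros of c1.
#include <cassert>
#include <cstdint>

int main() {
  const unsigned C2 = 4, Len = 12;                        // shift amount, mask length
  const uint64_t Mask = ((UINT64_C(1) << Len) - 1) << C2; // trailing zeros == C2
  const unsigned C3 = 64 - C2 - Len;                      // leading zeros of Mask
  for (uint64_t X : {UINT64_C(0), UINT64_C(0x1234), ~UINT64_C(0)}) {
    uint64_t ViaAndi = (X << C2) & Mask;         // shl + and (needs c1 materialized)
    uint64_t ViaShifts = (X << (C2 + C3)) >> C3; // slli + srli (no immediate needed)
    assert(ViaAndi == ViaShifts);
  }
  return 0;
}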
168
170 const APInt &Imm, Type *Ty,
172 Instruction *Inst) const {
173 assert(Ty->isIntegerTy() &&
174 "getIntImmCost can only estimate cost of materialising integers");
175
176 // We have a Zero register, so 0 is always free.
177 if (Imm == 0)
178 return TTI::TCC_Free;
179
180 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
181 // commutative, in others the immediate comes from a specific argument index.
182 bool Takes12BitImm = false;
183 unsigned ImmArgIdx = ~0U;
184
185 switch (Opcode) {
186 case Instruction::GetElementPtr:
187 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
188 // split up large offsets in GEP into better parts than ConstantHoisting
189 // can.
190 return TTI::TCC_Free;
191 case Instruction::Store: {
192 // Use the materialization cost regardless of whether it's the address or
193 // the value that is constant, except when the store is misaligned and
194 // misaligned accesses are not legal (experience shows constant hoisting
195 // can sometimes be harmful in such cases).
196 if (Idx == 1 || !Inst)
197 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
198 /*FreeZeroes=*/true);
199
200 StoreInst *ST = cast<StoreInst>(Inst);
201 if (!getTLI()->allowsMemoryAccessForAlignment(
202 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
203 ST->getPointerAddressSpace(), ST->getAlign()))
204 return TTI::TCC_Free;
205
206 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
207 /*FreeZeroes=*/true);
208 }
209 case Instruction::Load:
210 // If the address is a constant, use the materialization cost.
211 return getIntImmCost(Imm, Ty, CostKind);
212 case Instruction::And:
213 // zext.h
214 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
215 return TTI::TCC_Free;
216 // zext.w
217 if (Imm == UINT64_C(0xffffffff) &&
218 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
219 return TTI::TCC_Free;
220 // bclri
221 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
222 return TTI::TCC_Free;
223 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
224 canUseShiftPair(Inst, Imm))
225 return TTI::TCC_Free;
226 Takes12BitImm = true;
227 break;
228 case Instruction::Add:
229 Takes12BitImm = true;
230 break;
231 case Instruction::Or:
232 case Instruction::Xor:
233 // bseti/binvi
234 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
235 return TTI::TCC_Free;
236 Takes12BitImm = true;
237 break;
238 case Instruction::Mul:
239 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
240 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
241 return TTI::TCC_Free;
242 // One more or less than a power of 2 can use SLLI+ADD/SUB.
243 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
244 return TTI::TCC_Free;
245 // FIXME: There is no MULI instruction.
246 Takes12BitImm = true;
247 break;
248 case Instruction::Sub:
249 case Instruction::Shl:
250 case Instruction::LShr:
251 case Instruction::AShr:
252 Takes12BitImm = true;
253 ImmArgIdx = 1;
254 break;
255 default:
256 break;
257 }
258
259 if (Takes12BitImm) {
260 // Check immediate is the correct argument...
261 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
262 // ... and fits into the 12-bit immediate.
263 if (Imm.getSignificantBits() <= 64 &&
264 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
265 return TTI::TCC_Free;
266 }
267 }
268
269 // Otherwise, use the full materialisation cost.
270 return getIntImmCost(Imm, Ty, CostKind);
271 }
272
273 // By default, prevent hoisting.
274 return TTI::TCC_Free;
275}
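// An illustrative standalone check (separate from this file; not LLVM's
// isLegalAddImmediate) of the 12-bit immediate window referred to above:
// RISC-V I-type instructions such as addi/andi/ori encode imm[11:0] as a
// signed value, i.e. [-2048, 2047].
#include <cassert>
#include <cstdint>

static bool fitsSImm12(int64_t V) { return V >= -2048 && V <= 2047; }

int main() {
  assert(fitsSImm12(2047) && fitsSImm12(-2048));
  assert(!fitsSImm12(2048) && !fitsSImm12(-2049));
  return 0;
}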
276
279 const APInt &Imm, Type *Ty,
281 // Prevent hoisting in unknown cases.
282 return TTI::TCC_Free;
283}
284
286 return ST->hasVInstructions();
287}
288
290RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
291 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
292 return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
295}
296
298 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
300 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
302
303 // zve32x is broken for partial_reduce_umla, but let's make sure we
304 // don't generate them.
305 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
306 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
307 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
308 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
310
311 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
312 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
313 // Note: Assuming all vqdot* variants are equal cost
314 return LT.first *
315 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
316}
317
319 // Currently, the ExpandReductions pass can't expand scalable-vector
320 // reductions, but we still request expansion as RVV doesn't support certain
321 // reductions and the SelectionDAG can't legalize them either.
322 switch (II->getIntrinsicID()) {
323 default:
324 return false;
325 // These reductions have no equivalent in RVV
326 case Intrinsic::vector_reduce_mul:
327 case Intrinsic::vector_reduce_fmul:
328 return true;
329 }
330}
331
332std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
333 if (ST->hasVInstructions())
335 return BaseT::getMaxVScale();
336}
337
338std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
339 if (ST->hasVInstructions())
340 if (unsigned MinVLen = ST->getRealMinVLen();
341 MinVLen >= RISCV::RVVBitsPerBlock)
342 return MinVLen / RISCV::RVVBitsPerBlock;
344}
345
348 unsigned LMUL =
349 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
350 switch (K) {
352 return TypeSize::getFixed(ST->getXLen());
354 return TypeSize::getFixed(
355 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
358 (ST->hasVInstructions() &&
361 : 0);
362 }
363
364 llvm_unreachable("Unsupported register kind");
365}
366
368RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
370 // Add a cost of address generation + the cost of the load. The address
371 // is expected to be a PC relative offset to a constant pool entry
372 // using auipc/addi.
373 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
374 /*AddressSpace=*/0, CostKind);
375}
376
377static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
378 unsigned Size = Mask.size();
379 if (!isPowerOf2_32(Size))
380 return false;
381 for (unsigned I = 0; I != Size; ++I) {
382 if (static_cast<unsigned>(Mask[I]) == I)
383 continue;
384 if (Mask[I] != 0)
385 return false;
386 if (Size % I != 0)
387 return false;
388 for (unsigned J = I + 1; J != Size; ++J)
389 // Check the pattern is repeated.
390 if (static_cast<unsigned>(Mask[J]) != J % I)
391 return false;
392 SubVectorSize = I;
393 return true;
394 }
395 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
396 return false;
397}
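// A standalone re-implementation of the check above, purely for illustration
// (the function above remains the authoritative logic). The mask
// <0,1,0,1,0,1,0,1> is a repeated concatenation of a 2-element subvector,
// while the identity mask <0,1,2,3> is rejected.
#include <cassert>
#include <vector>

static bool isRepeatedConcat(const std::vector<int> &Mask, int &SubVectorSize) {
  unsigned Size = Mask.size();
  if (Size == 0 || (Size & (Size - 1)) != 0) // power-of-two sizes only
    return false;
  for (unsigned I = 0; I != Size; ++I) {
    if (static_cast<unsigned>(Mask[I]) == I)
      continue; // still matching the identity prefix
    if (Mask[I] != 0)
      return false; // a repeat must restart at element 0
    if (Size % I != 0)
      return false;
    for (unsigned J = I + 1; J != Size; ++J)
      if (static_cast<unsigned>(Mask[J]) != J % I)
        return false; // pattern must repeat every I lanes
    SubVectorSize = I;
    return true;
  }
  return false; // identity mask: not a concatenation
}

int main() {
  int Sub = 0;
  assert(isRepeatedConcat({0, 1, 0, 1, 0, 1, 0, 1}, Sub) && Sub == 2);
  assert(!isRepeatedConcat({0, 1, 2, 3}, Sub));
  return 0;
}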
398
400 LLVMContext &C) {
401 assert((DataVT.getScalarSizeInBits() != 8 ||
402 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
403 MVT IndexVT = DataVT.changeTypeToInteger();
404 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
405 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
406 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
407}
408
409/// Attempt to approximate the cost of a shuffle which will require splitting
410/// during legalization. Note that processShuffleMasks is not an exact proxy
411/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
412 /// reasonably close upper bound.
414 MVT LegalVT, VectorType *Tp,
415 ArrayRef<int> Mask,
417 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
418 "Expected fixed vector type and non-empty mask");
419 unsigned LegalNumElts = LegalVT.getVectorNumElements();
420 // Number of destination vectors after legalization:
421 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
422 // We are going to permute multiple sources and the result will be in
423 // multiple destinations. Providing an accurate cost only for splits where
424 // the element type remains the same.
425 if (NumOfDests <= 1 ||
427 Tp->getElementType()->getPrimitiveSizeInBits() ||
428 LegalNumElts >= Tp->getElementCount().getFixedValue())
430
431 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
432 unsigned LegalVTSize = LegalVT.getStoreSize();
433 // Number of source vectors after legalization:
434 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
435
436 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
437
438 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
439 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
440 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
441 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
442 assert(NormalizedVF >= Mask.size() &&
443 "Normalized mask expected to be not shorter than original mask.");
444 copy(Mask, NormalizedMask.begin());
446 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
448 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
449 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
450 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
451 return;
452 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
453 .second)
454 return;
457 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
458 SingleOpTy, RegMask, CostKind, 0, nullptr);
459 },
460 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
463 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
464 SingleOpTy, RegMask, CostKind, 0, nullptr);
465 });
466 return Cost;
467}
468
469/// Try to perform better estimation of the permutation.
470/// 1. Split the source/destination vectors into real registers.
471/// 2. Do the mask analysis to identify which real registers are
472/// permuted. If more than 1 source registers are used for the
473/// destination register building, the cost for this destination register
474/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
475/// source register is used, build mask and calculate the cost as a cost
476/// of PermuteSingleSrc.
477/// Also, for the single register permute we try to identify if the
478/// destination register is just a copy of the source register or the
479/// copy of the previous destination register (the cost is
480/// TTI::TCC_Basic). If the source register is just reused, the cost for
481/// this operation is 0.
482static InstructionCost
484 std::optional<unsigned> VLen, VectorType *Tp,
486 assert(LegalVT.isFixedLengthVector());
487 if (!VLen || Mask.empty())
489 MVT ElemVT = LegalVT.getVectorElementType();
490 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
491 LegalVT = TTI.getTypeLegalizationCost(
492 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
493 .second;
494 // Number of destination vectors after legalization:
495 InstructionCost NumOfDests =
496 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
497 if (NumOfDests <= 1 ||
499 Tp->getElementType()->getPrimitiveSizeInBits() ||
500 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
502
503 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
504 unsigned LegalVTSize = LegalVT.getStoreSize();
505 // Number of source vectors after legalization:
506 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
507
508 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
509 LegalVT.getVectorNumElements());
510
511 unsigned E = NumOfDests.getValue();
512 unsigned NormalizedVF =
513 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
514 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
515 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
516 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
517 assert(NormalizedVF >= Mask.size() &&
518 "Normalized mask expected to be not shorter than original mask.");
519 copy(Mask, NormalizedMask.begin());
521 int NumShuffles = 0;
522 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
524 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
525 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
526 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
527 return;
528 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
529 .second)
530 return;
531 ++NumShuffles;
533 SingleOpTy, RegMask, CostKind, 0, nullptr);
534 },
535 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
537 SingleOpTy, RegMask, CostKind, 0, nullptr);
538 NumShuffles += 2;
539 });
540 // Note: check that we do not emit too many shuffles here to prevent code
541 // size explosion.
542 // TODO: investigate whether this can be improved by extra analysis of the
543 // masks to check if the code is more profitable.
544 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
545 (NumOfDestRegs <= 2 && NumShuffles < 4))
546 return Cost;
548}
549
550InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
551 ArrayRef<int> Mask,
553 // Avoid missing masks and length changing shuffles
554 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
556
557 int NumElts = Tp->getNumElements();
558 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
559 // Avoid scalarization cases
560 if (!LT.second.isFixedLengthVector())
562
563 // Requires moving elements between parts, which requires additional
564 // unmodeled instructions.
565 if (LT.first != 1)
567
568 auto GetSlideOpcode = [&](int SlideAmt) {
569 assert(SlideAmt != 0);
570 bool IsVI = isUInt<5>(std::abs(SlideAmt));
571 if (SlideAmt < 0)
572 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
573 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
574 };
575
576 std::array<std::pair<int, int>, 2> SrcInfo;
577 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
579
580 if (SrcInfo[1].second == 0)
581 std::swap(SrcInfo[0], SrcInfo[1]);
582
583 InstructionCost FirstSlideCost = 0;
584 if (SrcInfo[0].second != 0) {
585 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
586 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
587 }
588
589 if (SrcInfo[1].first == -1)
590 return FirstSlideCost;
591
592 InstructionCost SecondSlideCost = 0;
593 if (SrcInfo[1].second != 0) {
594 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
595 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
596 } else {
597 SecondSlideCost =
598 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
599 }
600
601 auto EC = Tp->getElementCount();
602 VectorType *MaskTy =
604 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
605 return FirstSlideCost + SecondSlideCost + MaskCost;
606}
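// A hypothetical standalone illustration (separate from this file) of the
// opcode choice in GetSlideOpcode above: slide amounts whose magnitude fits a
// 5-bit unsigned immediate can use the .vi forms, while anything larger needs
// the .vx form and a scalar register holding the amount.
#include <cstdlib>
#include <iostream>
#include <string>

static std::string slideOpcodeFor(int SlideAmt) {
  bool IsVI = std::abs(SlideAmt) < 32; // uimm5 range is 0..31
  if (SlideAmt < 0)
    return IsVI ? "vslidedown.vi" : "vslidedown.vx";
  return IsVI ? "vslideup.vi" : "vslideup.vx";
}

int main() {
  std::cout << slideOpcodeFor(3) << "\n";   // vslideup.vi
  std::cout << slideOpcodeFor(-40) << "\n"; // vslidedown.vx
  return 0;
}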
607
610 VectorType *SrcTy, ArrayRef<int> Mask,
611 TTI::TargetCostKind CostKind, int Index,
613 const Instruction *CxtI) const {
614 assert((Mask.empty() || DstTy->isScalableTy() ||
615 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
616 "Expected the Mask to match the return size if given");
617 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
618 "Expected the same scalar types");
619
620 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
621 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
622
623 // First, handle cases where having a fixed length vector enables us to
624 // give a more accurate cost than falling back to generic scalable codegen.
625 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
626 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
627 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
629 *this, LT.second, ST->getRealVLen(),
630 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
631 if (VRegSplittingCost.isValid())
632 return VRegSplittingCost;
633 switch (Kind) {
634 default:
635 break;
637 if (Mask.size() >= 2) {
638 MVT EltTp = LT.second.getVectorElementType();
639 // If the size of the element is < ELEN then shuffles of interleaves and
640 // deinterleaves of 2 vectors can be lowered into the following
641 // sequences
642 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
643 // Example sequence:
644 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
645 // vwaddu.vv v10, v8, v9
646 // li a0, -1 (ignored)
647 // vwmaccu.vx v10, a0, v9
648 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
649 return 2 * LT.first * TLI->getLMULCost(LT.second);
650
651 if (Mask[0] == 0 || Mask[0] == 1) {
652 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
653 // Example sequence:
654 // vnsrl.wi v10, v8, 0
655 if (equal(DeinterleaveMask, Mask))
656 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
657 LT.second, CostKind);
658 }
659 }
660 int SubVectorSize;
661 if (LT.second.getScalarSizeInBits() != 1 &&
662 isRepeatedConcatMask(Mask, SubVectorSize)) {
664 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
665 // The cost of extraction from a subvector is 0 if the index is 0.
666 for (unsigned I = 0; I != NumSlides; ++I) {
667 unsigned InsertIndex = SubVectorSize * (1 << I);
668 FixedVectorType *SubTp =
669 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
670 FixedVectorType *DestTp =
672 std::pair<InstructionCost, MVT> DestLT =
674 // Add the cost of whole vector register move because the
675 // destination vector register group for vslideup cannot overlap the
676 // source.
677 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
678 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
679 CostKind, InsertIndex, SubTp);
680 }
681 return Cost;
682 }
683 }
684
685 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
686 SlideCost.isValid())
687 return SlideCost;
688
689 // vrgather + cost of generating the mask constant.
690 // We model this for an unknown mask with a single vrgather.
691 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
692 LT.second.getVectorNumElements() <= 256)) {
693 VectorType *IdxTy =
694 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
695 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
696 return IndexCost +
697 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
698 }
699 break;
700 }
703
704 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
705 SlideCost.isValid())
706 return SlideCost;
707
708 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
709 // register for the second vrgather. We model this for an unknown
710 // (shuffle) mask.
711 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
712 LT.second.getVectorNumElements() <= 256)) {
713 auto &C = SrcTy->getContext();
714 auto EC = SrcTy->getElementCount();
715 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
717 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
718 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
719 return 2 * IndexCost +
720 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
721 LT.second, CostKind) +
722 MaskCost;
723 }
724 break;
725 }
726 }
727
728 auto shouldSplit = [](TTI::ShuffleKind Kind) {
729 switch (Kind) {
730 default:
731 return false;
735 return true;
736 }
737 };
738
739 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
740 shouldSplit(Kind)) {
741 InstructionCost SplitCost =
742 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
743 if (SplitCost.isValid())
744 return SplitCost;
745 }
746 }
747
748 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
749 switch (Kind) {
750 default:
751 // Fallthrough to generic handling.
752 // TODO: Most of these cases will return getInvalid in generic code, and
753 // must be implemented here.
754 break;
756 // Extract at zero is always a subregister extract
757 if (Index == 0)
758 return TTI::TCC_Free;
759
760 // If we're extracting a subvector of at most m1 size at a sub-register
761 // boundary - which unfortunately we need exact vlen to identify - this is
762 // a subregister extract at worst and thus won't require a vslidedown.
763 // TODO: Extend for aligned m2, m4 subvector extracts
764 // TODO: Extend for misaligned (but contained) extracts
765 // TODO: Extend for scalable subvector types
766 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
767 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
768 if (std::optional<unsigned> VLen = ST->getRealVLen();
769 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
770 SubLT.second.getSizeInBits() <= *VLen)
771 return TTI::TCC_Free;
772 }
773
774 // Example sequence:
775 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
776 // vslidedown.vi v8, v9, 2
777 return LT.first *
778 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
780 // Example sequence:
781 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
782 // vslideup.vi v8, v9, 2
783 LT = getTypeLegalizationCost(DstTy);
784 return LT.first *
785 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
786 case TTI::SK_Select: {
787 // Example sequence:
788 // li a0, 90
789 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
790 // vmv.s.x v0, a0
791 // vmerge.vvm v8, v9, v8, v0
792 // We use 2 for the cost of the mask materialization as this is the true
793 // cost for small masks and most shuffles are small. At worst, this cost
794 // should be a very small constant for the constant pool load. As such,
795 // we may bias towards large selects slightly more than truly warranted.
796 return LT.first *
797 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
798 LT.second, CostKind));
799 }
800 case TTI::SK_Broadcast: {
801 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
802 Instruction::InsertElement);
803 if (LT.second.getScalarSizeInBits() == 1) {
804 if (HasScalar) {
805 // Example sequence:
806 // andi a0, a0, 1
807 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
808 // vmv.v.x v8, a0
809 // vmsne.vi v0, v8, 0
810 return LT.first *
811 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
812 LT.second, CostKind));
813 }
814 // Example sequence:
815 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
816 // vmv.v.i v8, 0
817 // vmerge.vim v8, v8, 1, v0
818 // vmv.x.s a0, v8
819 // andi a0, a0, 1
820 // vmv.v.x v8, a0
821 // vmsne.vi v0, v8, 0
822
823 return LT.first *
824 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
825 RISCV::VMV_X_S, RISCV::VMV_V_X,
826 RISCV::VMSNE_VI},
827 LT.second, CostKind));
828 }
829
830 if (HasScalar) {
831 // Example sequence:
832 // vmv.v.x v8, a0
833 return LT.first *
834 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
835 }
836
837 // Example sequence:
838 // vrgather.vi v9, v8, 0
839 return LT.first *
840 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
841 }
842 case TTI::SK_Splice: {
843 // vslidedown+vslideup.
844 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
845 // of similar code, but I think we expand through memory.
846 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
847 if (Index >= 0 && Index < 32)
848 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
849 else if (Index < 0 && Index > -32)
850 Opcodes[1] = RISCV::VSLIDEUP_VI;
851 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
852 }
853 case TTI::SK_Reverse: {
854
855 if (!LT.second.isVector())
857
858 // TODO: Cases to improve here:
859 // * Illegal vector types
860 // * i64 on RV32
861 if (SrcTy->getElementType()->isIntegerTy(1)) {
862 VectorType *WideTy =
864 cast<VectorType>(SrcTy)->getElementCount());
865 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
867 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
868 nullptr) +
869 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
871 }
872
873 MVT ContainerVT = LT.second;
874 if (LT.second.isFixedLengthVector())
875 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
876 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
877 if (ContainerVT.bitsLE(M1VT)) {
878 // Example sequence:
879 // csrr a0, vlenb
880 // srli a0, a0, 3
881 // addi a0, a0, -1
882 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
883 // vid.v v9
884 // vrsub.vx v10, v9, a0
885 // vrgather.vv v9, v8, v10
886 InstructionCost LenCost = 3;
887 if (LT.second.isFixedLengthVector())
888 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
889 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
890 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
891 if (LT.second.isFixedLengthVector() &&
892 isInt<5>(LT.second.getVectorNumElements() - 1))
893 Opcodes[1] = RISCV::VRSUB_VI;
894 InstructionCost GatherCost =
895 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
896 return LT.first * (LenCost + GatherCost);
897 }
898
899 // At high LMUL, we split into a series of M1 reverses (see
900 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
901 // the resulting gap at the bottom (for fixed vectors only). The important
902 // bit is that the cost scales linearly, not quadratically with LMUL.
903 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
904 InstructionCost FixedCost =
905 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
906 unsigned Ratio =
908 InstructionCost GatherCost =
909 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
910 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
911 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
912 return FixedCost + LT.first * (GatherCost + SlideCost);
913 }
914 }
915 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
916 SubTp);
917}
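// A standalone sketch (separate from this file) of the vid.v + vrsub.vx index
// computation that the SK_Reverse costing above models: the gather indices for
// a reverse of VL elements are VL-1, VL-2, ..., 0.
#include <cstdio>
#include <vector>

static std::vector<unsigned> reverseGatherIndices(unsigned VL) {
  std::vector<unsigned> Idx(VL);
  for (unsigned I = 0; I < VL; ++I) // vid.v
    Idx[I] = I;
  for (unsigned I = 0; I < VL; ++I) // vrsub.vx with the scalar VL-1
    Idx[I] = (VL - 1) - Idx[I];
  return Idx;                       // feeds vrgather.vv
}

int main() {
  for (unsigned I : reverseGatherIndices(8))
    std::printf("%u ", I); // 7 6 5 4 3 2 1 0
  std::printf("\n");
  return 0;
}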
918
919static unsigned isM1OrSmaller(MVT VT) {
921 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
925}
926
928 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
929 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
930 ArrayRef<Value *> VL) const {
931 if (isa<ScalableVectorType>(Ty))
933
934 // A build_vector (which is m1 sized or smaller) can be done in no
935 // worse than one vslide1down.vx per element in the type. We could
936 // in theory do an explode_vector in the inverse manner, but our
937 // lowering today does not have a first class node for this pattern.
939 Ty, DemandedElts, Insert, Extract, CostKind);
940 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
941 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
942 if (Ty->getScalarSizeInBits() == 1) {
943 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
944 // Note: Implicit scalar anyextend is assumed to be free since the i1
945 // must be stored in a GPR.
946 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
947 CostKind) +
948 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
950 }
951
952 assert(LT.second.isFixedLengthVector());
953 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
954 if (isM1OrSmaller(ContainerVT)) {
955 InstructionCost BV =
956 cast<FixedVectorType>(Ty)->getNumElements() *
957 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
958 if (BV < Cost)
959 Cost = BV;
960 }
961 }
962 return Cost;
963}
964
966RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
967 unsigned AddressSpace,
969 if (!isLegalMaskedLoadStore(Src, Alignment) ||
971 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
972 CostKind);
973
974 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
975}
976
978 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
979 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
980 bool UseMaskForCond, bool UseMaskForGaps) const {
981
982 // The interleaved memory access pass will lower (de)interleave ops combined
983 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
984 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
985 // gap).
986 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
987 auto *VTy = cast<VectorType>(VecTy);
988 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
989 // Need to make sure the type hasn't been scalarized
990 if (LT.second.isVector()) {
991 auto *SubVecTy =
992 VectorType::get(VTy->getElementType(),
993 VTy->getElementCount().divideCoefficientBy(Factor));
994 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
995 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
996 AddressSpace, DL)) {
997
998 // Some processors optimize segment loads/stores as one wide memory op +
999 // Factor * LMUL shuffle ops.
1000 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1002 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1003 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1004 Cost += Factor * TLI->getLMULCost(SubVecVT);
1005 return LT.first * Cost;
1006 }
1007
1008 // Otherwise, the cost is proportional to the number of elements (VL *
1009 // Factor ops).
1010 InstructionCost MemOpCost =
1011 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1012 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1013 unsigned NumLoads = getEstimatedVLFor(VTy);
1014 return NumLoads * MemOpCost;
1015 }
1016 }
1017 }
1018
1019 // TODO: Return the cost of interleaved accesses for scalable vector when
1020 // unable to convert to segment accesses instructions.
1021 if (isa<ScalableVectorType>(VecTy))
1023
1024 auto *FVTy = cast<FixedVectorType>(VecTy);
1025 InstructionCost MemCost =
1026 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1027 unsigned VF = FVTy->getNumElements() / Factor;
1028
1029 // An interleaved load will look like this for Factor=3:
1030 // %wide.vec = load <12 x i32>, ptr %3, align 4
1031 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1032 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1033 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1034 if (Opcode == Instruction::Load) {
1035 InstructionCost Cost = MemCost;
1036 for (unsigned Index : Indices) {
1037 FixedVectorType *VecTy =
1038 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1039 auto Mask = createStrideMask(Index, Factor, VF);
1040 Mask.resize(VF * Factor, -1);
1041 InstructionCost ShuffleCost =
1043 Mask, CostKind, 0, nullptr, {});
1044 Cost += ShuffleCost;
1045 }
1046 return Cost;
1047 }
1048
1049 // TODO: Model for NF > 2
1050 // We'll need to enhance getShuffleCost to model shuffles that are just
1051 // inserts and extracts into subvectors, since they won't have the full cost
1052 // of a vrgather.
1053 // An interleaved store for 3 vectors of 4 lanes will look like
1054 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1055 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1056 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1057 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1058 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1059 if (Factor != 2)
1060 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1061 Alignment, AddressSpace, CostKind,
1062 UseMaskForCond, UseMaskForGaps);
1063
1064 assert(Opcode == Instruction::Store && "Opcode must be a store");
1065 // For an interleaving store of 2 vectors, we perform one large interleaving
1066 // shuffle that goes into the wide store
1067 auto Mask = createInterleaveMask(VF, Factor);
1068 InstructionCost ShuffleCost =
1070 CostKind, 0, nullptr, {});
1071 return MemCost + ShuffleCost;
1072}
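// A hypothetical standalone helper (separate from this file), in the spirit of
// the createStrideMask call used above, showing the deinterleave masks being
// costed. For Factor=3 and VF=4, the three strided shuffles of the <12 x i32>
// wide load select lanes <0,3,6,9>, <1,4,7,10> and <2,5,8,11>.
#include <cstdio>
#include <vector>

static std::vector<int> strideMask(unsigned Start, unsigned Stride, unsigned VF) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    Mask.push_back(Start + Stride * I);
  return Mask;
}

int main() {
  const unsigned Factor = 3, VF = 4;
  for (unsigned Index = 0; Index < Factor; ++Index) {
    for (int M : strideMask(Index, Factor, VF))
      std::printf("%d ", M);
    std::printf("\n");
  }
  return 0;
}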
1073
1075 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1076 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1078 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1079 Alignment, CostKind, I);
1080
1081 if ((Opcode == Instruction::Load &&
1082 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1083 (Opcode == Instruction::Store &&
1084 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1085 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1086 Alignment, CostKind, I);
1087
1088 // Cost is proportional to the number of memory operations implied. For
1089 // scalable vectors, we use an estimate on that number since we don't
1090 // know exactly what VL will be.
1091 auto &VTy = *cast<VectorType>(DataTy);
1092 InstructionCost MemOpCost =
1093 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1094 {TTI::OK_AnyValue, TTI::OP_None}, I);
1095 unsigned NumLoads = getEstimatedVLFor(&VTy);
1096 return NumLoads * MemOpCost;
1097}
1098
1100 unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
1101 TTI::TargetCostKind CostKind, const Instruction *I) const {
1102 bool IsLegal = (Opcode == Instruction::Store &&
1103 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1104 (Opcode == Instruction::Load &&
1105 isLegalMaskedExpandLoad(DataTy, Alignment));
1106 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1107 return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
1108 Alignment, CostKind, I);
1109 // Example compressstore sequence:
1110 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1111 // vcompress.vm v10, v8, v0
1112 // vcpop.m a1, v0
1113 // vsetvli zero, a1, e32, m2, ta, ma
1114 // vse32.v v10, (a0)
1115 // Example expandload sequence:
1116 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1117 // vcpop.m a1, v0
1118 // vsetvli zero, a1, e32, m2, ta, ma
1119 // vle32.v v10, (a0)
1120 // vsetivli zero, 8, e32, m2, ta, ma
1121 // viota.m v12, v0
1122 // vrgather.vv v8, v10, v12, v0.t
1123 auto MemOpCost =
1124 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1125 auto LT = getTypeLegalizationCost(DataTy);
1126 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1127 if (VariableMask)
1128 Opcodes.push_back(RISCV::VCPOP_M);
1129 if (Opcode == Instruction::Store)
1130 Opcodes.append({RISCV::VCOMPRESS_VM});
1131 else
1132 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1133 return MemOpCost +
1134 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1135}
1136
1138 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1139 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1140 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1141 !isLegalStridedLoadStore(DataTy, Alignment)) ||
1142 (Opcode != Instruction::Load && Opcode != Instruction::Store))
1143 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
1144 Alignment, CostKind, I);
1145
1147 return TTI::TCC_Basic;
1148
1149 // Cost is proportional to the number of memory operations implied. For
1150 // scalable vectors, we use an estimate on that number since we don't
1151 // know exactly what VL will be.
1152 auto &VTy = *cast<VectorType>(DataTy);
1153 InstructionCost MemOpCost =
1154 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1155 {TTI::OK_AnyValue, TTI::OP_None}, I);
1156 unsigned NumLoads = getEstimatedVLFor(&VTy);
1157 return NumLoads * MemOpCost;
1158}
1159
1162 // FIXME: This is a property of the default vector convention, not
1163 // all possible calling conventions. Fixing that will require
1164 // some TTI API and SLP rework.
1167 for (auto *Ty : Tys) {
1168 if (!Ty->isVectorTy())
1169 continue;
1170 Align A = DL.getPrefTypeAlign(Ty);
1171 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1172 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1173 }
1174 return Cost;
1175}
1176
1177// Currently, these represent both throughput and codesize costs
1178// for the respective intrinsics. The costs in this table are simply
1179// instruction counts with the following adjustments made:
1180// * One vsetvli is considered free.
1182 {Intrinsic::floor, MVT::f32, 9},
1183 {Intrinsic::floor, MVT::f64, 9},
1184 {Intrinsic::ceil, MVT::f32, 9},
1185 {Intrinsic::ceil, MVT::f64, 9},
1186 {Intrinsic::trunc, MVT::f32, 7},
1187 {Intrinsic::trunc, MVT::f64, 7},
1188 {Intrinsic::round, MVT::f32, 9},
1189 {Intrinsic::round, MVT::f64, 9},
1190 {Intrinsic::roundeven, MVT::f32, 9},
1191 {Intrinsic::roundeven, MVT::f64, 9},
1192 {Intrinsic::rint, MVT::f32, 7},
1193 {Intrinsic::rint, MVT::f64, 7},
1194 {Intrinsic::nearbyint, MVT::f32, 9},
1195 {Intrinsic::nearbyint, MVT::f64, 9},
1196 {Intrinsic::bswap, MVT::i16, 3},
1197 {Intrinsic::bswap, MVT::i32, 12},
1198 {Intrinsic::bswap, MVT::i64, 31},
1199 {Intrinsic::vp_bswap, MVT::i16, 3},
1200 {Intrinsic::vp_bswap, MVT::i32, 12},
1201 {Intrinsic::vp_bswap, MVT::i64, 31},
1202 {Intrinsic::vp_fshl, MVT::i8, 7},
1203 {Intrinsic::vp_fshl, MVT::i16, 7},
1204 {Intrinsic::vp_fshl, MVT::i32, 7},
1205 {Intrinsic::vp_fshl, MVT::i64, 7},
1206 {Intrinsic::vp_fshr, MVT::i8, 7},
1207 {Intrinsic::vp_fshr, MVT::i16, 7},
1208 {Intrinsic::vp_fshr, MVT::i32, 7},
1209 {Intrinsic::vp_fshr, MVT::i64, 7},
1210 {Intrinsic::bitreverse, MVT::i8, 17},
1211 {Intrinsic::bitreverse, MVT::i16, 24},
1212 {Intrinsic::bitreverse, MVT::i32, 33},
1213 {Intrinsic::bitreverse, MVT::i64, 52},
1214 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1215 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1216 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1217 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1218 {Intrinsic::ctpop, MVT::i8, 12},
1219 {Intrinsic::ctpop, MVT::i16, 19},
1220 {Intrinsic::ctpop, MVT::i32, 20},
1221 {Intrinsic::ctpop, MVT::i64, 21},
1222 {Intrinsic::ctlz, MVT::i8, 19},
1223 {Intrinsic::ctlz, MVT::i16, 28},
1224 {Intrinsic::ctlz, MVT::i32, 31},
1225 {Intrinsic::ctlz, MVT::i64, 35},
1226 {Intrinsic::cttz, MVT::i8, 16},
1227 {Intrinsic::cttz, MVT::i16, 23},
1228 {Intrinsic::cttz, MVT::i32, 24},
1229 {Intrinsic::cttz, MVT::i64, 25},
1230 {Intrinsic::vp_ctpop, MVT::i8, 12},
1231 {Intrinsic::vp_ctpop, MVT::i16, 19},
1232 {Intrinsic::vp_ctpop, MVT::i32, 20},
1233 {Intrinsic::vp_ctpop, MVT::i64, 21},
1234 {Intrinsic::vp_ctlz, MVT::i8, 19},
1235 {Intrinsic::vp_ctlz, MVT::i16, 28},
1236 {Intrinsic::vp_ctlz, MVT::i32, 31},
1237 {Intrinsic::vp_ctlz, MVT::i64, 35},
1238 {Intrinsic::vp_cttz, MVT::i8, 16},
1239 {Intrinsic::vp_cttz, MVT::i16, 23},
1240 {Intrinsic::vp_cttz, MVT::i32, 24},
1241 {Intrinsic::vp_cttz, MVT::i64, 25},
1242};
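// A minimal standalone sketch of the kind of lookup CostTableLookup performs
// on the table above: a linear scan keyed on (intrinsic id, element type).
// The struct, ids, and types below are hypothetical stand-ins, not LLVM's.
#include <cstdio>
#include <optional>

struct CostTblEntry {
  int IntrinsicID;
  int ElementType;
  unsigned Cost;
};

static std::optional<unsigned>
lookupCost(const CostTblEntry *Table, unsigned Size, int ID, int EltTy) {
  for (unsigned I = 0; I < Size; ++I)
    if (Table[I].IntrinsicID == ID && Table[I].ElementType == EltTy)
      return Table[I].Cost;
  return std::nullopt; // caller falls back to the generic implementation
}

int main() {
  enum { CTPOP = 1, BSWAP = 2 };
  enum { I16 = 16, I32 = 32 };
  static const CostTblEntry Table[] = {{CTPOP, I32, 20}, {BSWAP, I16, 3}};
  if (auto C = lookupCost(Table, 2, CTPOP, I32))
    std::printf("ctpop i32 cost = %u\n", *C); // 20
  return 0;
}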
1243
1247 auto *RetTy = ICA.getReturnType();
1248 switch (ICA.getID()) {
1249 case Intrinsic::lrint:
1250 case Intrinsic::llrint:
1251 case Intrinsic::lround:
1252 case Intrinsic::llround: {
1253 auto LT = getTypeLegalizationCost(RetTy);
1254 Type *SrcTy = ICA.getArgTypes().front();
1255 auto SrcLT = getTypeLegalizationCost(SrcTy);
1256 if (ST->hasVInstructions() && LT.second.isVector()) {
1258 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1259 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1260 if (LT.second.getVectorElementType() == MVT::bf16) {
1261 if (!ST->hasVInstructionsBF16Minimal())
1263 if (DstEltSz == 32)
1264 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1265 else
1266 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1267 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1268 !ST->hasVInstructionsF16()) {
1269 if (!ST->hasVInstructionsF16Minimal())
1271 if (DstEltSz == 32)
1272 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1273 else
1274 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1275
1276 } else if (SrcEltSz > DstEltSz) {
1277 Ops = {RISCV::VFNCVT_X_F_W};
1278 } else if (SrcEltSz < DstEltSz) {
1279 Ops = {RISCV::VFWCVT_X_F_V};
1280 } else {
1281 Ops = {RISCV::VFCVT_X_F_V};
1282 }
1283
1284 // We need to use the source LMUL in the case of a narrowing op, and the
1285 // destination LMUL otherwise.
1286 if (SrcEltSz > DstEltSz)
1287 return SrcLT.first *
1288 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1289 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1290 }
1291 break;
1292 }
1293 case Intrinsic::ceil:
1294 case Intrinsic::floor:
1295 case Intrinsic::trunc:
1296 case Intrinsic::rint:
1297 case Intrinsic::round:
1298 case Intrinsic::roundeven: {
1299 // These all use the same code.
1300 auto LT = getTypeLegalizationCost(RetTy);
1301 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1302 return LT.first * 8;
1303 break;
1304 }
1305 case Intrinsic::umin:
1306 case Intrinsic::umax:
1307 case Intrinsic::smin:
1308 case Intrinsic::smax: {
1309 auto LT = getTypeLegalizationCost(RetTy);
1310 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1311 return LT.first;
1312
1313 if (ST->hasVInstructions() && LT.second.isVector()) {
1314 unsigned Op;
1315 switch (ICA.getID()) {
1316 case Intrinsic::umin:
1317 Op = RISCV::VMINU_VV;
1318 break;
1319 case Intrinsic::umax:
1320 Op = RISCV::VMAXU_VV;
1321 break;
1322 case Intrinsic::smin:
1323 Op = RISCV::VMIN_VV;
1324 break;
1325 case Intrinsic::smax:
1326 Op = RISCV::VMAX_VV;
1327 break;
1328 }
1329 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1330 }
1331 break;
1332 }
1333 case Intrinsic::sadd_sat:
1334 case Intrinsic::ssub_sat:
1335 case Intrinsic::uadd_sat:
1336 case Intrinsic::usub_sat: {
1337 auto LT = getTypeLegalizationCost(RetTy);
1338 if (ST->hasVInstructions() && LT.second.isVector()) {
1339 unsigned Op;
1340 switch (ICA.getID()) {
1341 case Intrinsic::sadd_sat:
1342 Op = RISCV::VSADD_VV;
1343 break;
1344 case Intrinsic::ssub_sat:
1345 Op = RISCV::VSSUBU_VV;
1346 break;
1347 case Intrinsic::uadd_sat:
1348 Op = RISCV::VSADDU_VV;
1349 break;
1350 case Intrinsic::usub_sat:
1351 Op = RISCV::VSSUBU_VV;
1352 break;
1353 }
1354 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1355 }
1356 break;
1357 }
1358 case Intrinsic::fma:
1359 case Intrinsic::fmuladd: {
1360 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1361 auto LT = getTypeLegalizationCost(RetTy);
1362 if (ST->hasVInstructions() && LT.second.isVector())
1363 return LT.first *
1364 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1365 break;
1366 }
1367 case Intrinsic::fabs: {
1368 auto LT = getTypeLegalizationCost(RetTy);
1369 if (ST->hasVInstructions() && LT.second.isVector()) {
1370 // lui a0, 8
1371 // addi a0, a0, -1
1372 // vsetvli a1, zero, e16, m1, ta, ma
1373 // vand.vx v8, v8, a0
1374 // f16 with zvfhmin and bf16 with zvfbfmin
1375 if (LT.second.getVectorElementType() == MVT::bf16 ||
1376 (LT.second.getVectorElementType() == MVT::f16 &&
1377 !ST->hasVInstructionsF16()))
1378 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1379 CostKind) +
1380 2;
1381 else
1382 return LT.first *
1383 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1384 }
1385 break;
1386 }
1387 case Intrinsic::sqrt: {
1388 auto LT = getTypeLegalizationCost(RetTy);
1389 if (ST->hasVInstructions() && LT.second.isVector()) {
1392 MVT ConvType = LT.second;
1393 MVT FsqrtType = LT.second;
1394 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1395 // will be split.
1396 if (LT.second.getVectorElementType() == MVT::bf16) {
1397 if (LT.second == MVT::nxv32bf16) {
1398 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1399 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1400 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1401 ConvType = MVT::nxv16f16;
1402 FsqrtType = MVT::nxv16f32;
1403 } else {
1404 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1405 FsqrtOp = {RISCV::VFSQRT_V};
1406 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1407 }
1408 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1409 !ST->hasVInstructionsF16()) {
1410 if (LT.second == MVT::nxv32f16) {
1411 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1412 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1413 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1414 ConvType = MVT::nxv16f16;
1415 FsqrtType = MVT::nxv16f32;
1416 } else {
1417 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1418 FsqrtOp = {RISCV::VFSQRT_V};
1419 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1420 }
1421 } else {
1422 FsqrtOp = {RISCV::VFSQRT_V};
1423 }
1424
1425 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1426 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1427 }
1428 break;
1429 }
1430 case Intrinsic::cttz:
1431 case Intrinsic::ctlz:
1432 case Intrinsic::ctpop: {
1433 auto LT = getTypeLegalizationCost(RetTy);
1434 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1435 unsigned Op;
1436 switch (ICA.getID()) {
1437 case Intrinsic::cttz:
1438 Op = RISCV::VCTZ_V;
1439 break;
1440 case Intrinsic::ctlz:
1441 Op = RISCV::VCLZ_V;
1442 break;
1443 case Intrinsic::ctpop:
1444 Op = RISCV::VCPOP_V;
1445 break;
1446 }
1447 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1448 }
1449 break;
1450 }
1451 case Intrinsic::abs: {
1452 auto LT = getTypeLegalizationCost(RetTy);
1453 if (ST->hasVInstructions() && LT.second.isVector()) {
1454 // vrsub.vi v10, v8, 0
1455 // vmax.vv v8, v8, v10
1456 return LT.first *
1457 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1458 LT.second, CostKind);
1459 }
1460 break;
1461 }
1462 case Intrinsic::get_active_lane_mask: {
1463 if (ST->hasVInstructions()) {
1464 Type *ExpRetTy = VectorType::get(
1465 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1466 auto LT = getTypeLegalizationCost(ExpRetTy);
1467
1468 // vid.v v8 // considered hoisted
1469 // vsaddu.vx v8, v8, a0
1470 // vmsltu.vx v0, v8, a1
1471 return LT.first *
1472 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1473 LT.second, CostKind);
1474 }
1475 break;
1476 }
1477 // TODO: add more intrinsics
1478 case Intrinsic::stepvector: {
1479 auto LT = getTypeLegalizationCost(RetTy);
1480 // Legalisation of illegal types involves an `index' instruction plus
1481 // (LT.first - 1) vector adds.
1482 if (ST->hasVInstructions())
1483 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1484 (LT.first - 1) *
1485 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1486 return 1 + (LT.first - 1);
1487 }
1488 case Intrinsic::experimental_cttz_elts: {
1489 Type *ArgTy = ICA.getArgTypes()[0];
1490 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1491 if (getTLI()->shouldExpandCttzElements(ArgType))
1492 break;
1493 InstructionCost Cost = getRISCVInstructionCost(
1494 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1495
1496 // If zero_is_poison is false, then we will generate additional
1497 // cmp + select instructions to convert -1 to EVL.
1498 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1499 if (ICA.getArgs().size() > 1 &&
1500 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1501 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1503 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1505
1506 return Cost;
1507 }
1508 case Intrinsic::experimental_vp_splat: {
1509 auto LT = getTypeLegalizationCost(RetTy);
1510 // TODO: Lower i1 experimental_vp_splat
1511 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1513 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1514 ? RISCV::VFMV_V_F
1515 : RISCV::VMV_V_X,
1516 LT.second, CostKind);
1517 }
1518 case Intrinsic::experimental_vp_splice: {
1519 // To support type-based queries from the vectorizer, set the index to 0.
1520 // Note that the index only changes the cost from vslide.vx to vslide.vi, and
1521 // in current implementations they have the same cost.
1522 return getShuffleCost(TTI::SK_Splice, cast<VectorType>(ICA.getReturnType()),
1523 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1524 0, cast<VectorType>(ICA.getReturnType()));
1525 }
1526 case Intrinsic::fptoui_sat:
1527 case Intrinsic::fptosi_sat: {
1529 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1530 Type *SrcTy = ICA.getArgTypes()[0];
1531
1532 auto SrcLT = getTypeLegalizationCost(SrcTy);
1533 auto DstLT = getTypeLegalizationCost(RetTy);
1534 if (!SrcTy->isVectorTy())
1535 break;
1536
1537 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1539
1540 Cost +=
1541 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1543
1544 // Handle NaN.
1545 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1546 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1547 Type *CondTy = RetTy->getWithNewBitWidth(1);
1548 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1550 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1552 return Cost;
1553 }
1554 }
1555
1556 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1557 if (auto LT = getTypeLegalizationCost(RetTy);
1558 LT.second.isVector()) {
1559 MVT EltTy = LT.second.getVectorElementType();
1560 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1561 ICA.getID(), EltTy))
1562 return LT.first * Entry->Cost;
1563 }
1564 }
1565
1567}
1568
1571 const SCEV *Ptr,
1573 // Address computations for vector indexed load/store likely require an offset
1574 // and/or scaling.
1575 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1576 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1577
1578 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1579}
1580
1582 Type *Src,
1585 const Instruction *I) const {
1586 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1587 if (!IsVectorType)
1588 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1589
1590 // FIXME: Need to compute legalizing cost for illegal types. The current
1591 // code handles only legal types and those which can be trivially
1592 // promoted to legal.
1593 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1594 Dst->getScalarSizeInBits() > ST->getELen())
1595 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1596
1597 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1598 assert(ISD && "Invalid opcode");
1599 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1600 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1601
1602 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1603 // The shared implementation doesn't model vector widening during legalization
1604 // and instead assumes scalarization. In order to scalarize an <N x i1>
1605 // vector, we need to extend/trunc to/from i8. If we don't special case
1606 // this, we can get an infinite recursion cycle.
1607 switch (ISD) {
1608 default:
1609 break;
1610 case ISD::SIGN_EXTEND:
1611 case ISD::ZERO_EXTEND:
1612 if (Src->getScalarSizeInBits() == 1) {
1613 // We do not use vsext/vzext to extend from mask vector.
1614 // Instead we use the following instructions to extend from mask vector:
1615 // vmv.v.i v8, 0
1616 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1617 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1618 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1619 DstLT.second, CostKind) +
1620 DstLT.first - 1;
1621 }
1622 break;
1623 case ISD::TRUNCATE:
1624 if (Dst->getScalarSizeInBits() == 1) {
1625 // We do not use several vncvt instructions to truncate to a mask vector,
1626 // so we cannot use PowDiff to calculate the cost.
1627 // Instead we use the following instructions to truncate to mask vector:
1628 // vand.vi v8, v8, 1
1629 // vmsne.vi v0, v8, 0
1630 return SrcLT.first *
1631 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1632 SrcLT.second, CostKind) +
1633 SrcLT.first - 1;
1634 }
1635 break;
1636 };
1637
1638 // Our actual lowering for the case where a wider legal type is available
1639 // uses promotion to the wider type. This is reflected in the result of
1640 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1641 // scalarized if the legalized Src and Dst are not equal sized.
1642 const DataLayout &DL = this->getDataLayout();
1643 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1644 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1646 SrcLT.second.getSizeInBits()) ||
1648 DstLT.second.getSizeInBits()))
1649 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1650
1651 // The split cost is handled by the base getCastInstrCost
1652 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1653
1654 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1655 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1656 switch (ISD) {
1657 case ISD::SIGN_EXTEND:
1658 case ISD::ZERO_EXTEND: {
1659 if ((PowDiff < 1) || (PowDiff > 3))
1660 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1661 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1662 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1663 unsigned Op =
1664 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1665 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1666 }
1667 case ISD::TRUNCATE:
1668 case ISD::FP_EXTEND:
1669 case ISD::FP_ROUND: {
1670 // Counts of narrow/widen instructions.
1671 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1672 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1673
1674 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1675 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1676 : RISCV::VFNCVT_F_F_W;
1678 for (; SrcEltSize != DstEltSize;) {
1679 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1680 ? MVT::getIntegerVT(DstEltSize)
1681 : MVT::getFloatingPointVT(DstEltSize);
1682 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1683 DstEltSize =
1684 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1685 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1686 }
1687 return Cost;
1688 }
1689 case ISD::FP_TO_SINT:
1690 case ISD::FP_TO_UINT: {
1691 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1692 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1693 unsigned FWCVT =
1694 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1695 unsigned FNCVT =
1696 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1697 unsigned SrcEltSize = Src->getScalarSizeInBits();
1698 unsigned DstEltSize = Dst->getScalarSizeInBits();
1700 if ((SrcEltSize == 16) &&
1701 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1702 // If the target only supports zvfhmin, or this is an fp16-to-i64 conversion,
1703 // pre-widen to f32 and then convert the f32 to an integer.
1704 VectorType *VecF32Ty =
1705 VectorType::get(Type::getFloatTy(Dst->getContext()),
1706 cast<VectorType>(Dst)->getElementCount());
1707 std::pair<InstructionCost, MVT> VecF32LT =
1708 getTypeLegalizationCost(VecF32Ty);
1709 Cost +=
1710 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1711 VecF32LT.second, CostKind);
1712 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1713 return Cost;
1714 }
1715 if (DstEltSize == SrcEltSize)
1716 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1717 else if (DstEltSize > SrcEltSize)
1718 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1719 else { // (SrcEltSize > DstEltSize)
1720 // First do a narrowing conversion to an integer half the size, then
1721 // truncate if needed.
1722 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1723 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1724 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1725 if ((SrcEltSize / 2) > DstEltSize) {
1726 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1727 Cost +=
1728 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1729 }
1730 }
1731 return Cost;
1732 }
1733 case ISD::SINT_TO_FP:
1734 case ISD::UINT_TO_FP: {
1735 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1736 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1737 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1738 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1739 unsigned SrcEltSize = Src->getScalarSizeInBits();
1740 unsigned DstEltSize = Dst->getScalarSizeInBits();
1741
1743 if ((DstEltSize == 16) &&
1744 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1745 // If the target only supports zvfhmin, or this is an i64-to-fp16 conversion,
1746 // the value is converted to f32 first and then to f16.
1747 VectorType *VecF32Ty =
1748 VectorType::get(Type::getFloatTy(Dst->getContext()),
1749 cast<VectorType>(Dst)->getElementCount());
1750 std::pair<InstructionCost, MVT> VecF32LT =
1751 getTypeLegalizationCost(VecF32Ty);
1752 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1753 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1754 DstLT.second, CostKind);
1755 return Cost;
1756 }
1757
1758 if (DstEltSize == SrcEltSize)
1759 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1760 else if (DstEltSize > SrcEltSize) {
1761 if ((DstEltSize / 2) > SrcEltSize) {
1762 VectorType *VecTy =
1763 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1764 cast<VectorType>(Dst)->getElementCount());
1765 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1766 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1767 }
1768 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1769 } else
1770 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1771 return Cost;
1772 }
1773 }
1774 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1775}
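// The extend path above picks VSEXT_VF2/VF4/VF8 (or the VZEXT equivalents) from
// the log2 difference between the destination and source element widths. Below
// is a minimal standalone sketch of that selection, assuming power-of-two
// widths; log2u and pickExtendOp are ad-hoc helper names, not LLVM API.
#include <cassert>
#include <string>

static unsigned log2u(unsigned X) {
  unsigned L = 0;
  while (X >>= 1)
    ++L;
  return L;
}

static std::string pickExtendOp(bool IsSigned, unsigned SrcBits, unsigned DstBits) {
  int PowDiff = (int)log2u(DstBits) - (int)log2u(SrcBits);
  // Ratios other than 2x/4x/8x fall back to the BasicTTI cost model above.
  assert(PowDiff >= 1 && PowDiff <= 3 && "unsupported widening ratio");
  const char *SExt[] = {"vsext.vf2", "vsext.vf4", "vsext.vf8"};
  const char *ZExt[] = {"vzext.vf2", "vzext.vf4", "vzext.vf8"};
  return IsSigned ? SExt[PowDiff - 1] : ZExt[PowDiff - 1];
}
// e.g. pickExtendOp(/*IsSigned=*/true, /*SrcBits=*/8, /*DstBits=*/32) == "vsext.vf4"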
1776
1777unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1778 if (isa<ScalableVectorType>(Ty)) {
1779 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1780 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1781 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1782 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1783 }
1784 return cast<FixedVectorType>(Ty)->getNumElements();
1785}
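// For scalable types the function above estimates VL from the tuning vscale and
// the usual VLMAX relation VLMAX = (VLEN / SEW) * LMUL. A rough standalone
// sketch of that arithmetic; estimatedVL is an ad-hoc helper, and the exact
// rounding of RISCVTargetLowering::computeVLMAX may differ slightly.
#include <cstdint>

static unsigned estimatedVL(unsigned VScaleForTuning, unsigned EltSizeBits,
                            unsigned MinKnownBits) {
  const unsigned RVVBitsPerBlock = 64;                     // one RVV register block
  unsigned VectorBits = VScaleForTuning * RVVBitsPerBlock; // tuned VLEN estimate
  // VLMAX = (VLEN / SEW) * (MinKnownBits / RVVBitsPerBlock), ordered to avoid
  // losing fractional LMUL to integer division.
  return (unsigned)((uint64_t)VectorBits / EltSizeBits * MinKnownBits /
                    RVVBitsPerBlock);
}
// e.g. <vscale x 4 x i32> with vscale-for-tuning = 2:
//   estimatedVL(2, 32, 128) == (128 / 32) * 2 == 8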
1786
1789 FastMathFlags FMF,
1791 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1792 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1793
1794 // Skip if scalar size of Ty is bigger than ELEN.
1795 if (Ty->getScalarSizeInBits() > ST->getELen())
1796 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1797
1798 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1799 if (Ty->getElementType()->isIntegerTy(1)) {
1800 // SelectionDAGBuilder does following transforms:
1801 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1802 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1803 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1804 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1805 else
1806 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1807 }
1808
1809 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1811 InstructionCost ExtraCost = 0;
1812 switch (IID) {
1813 case Intrinsic::maximum:
1814 if (FMF.noNaNs()) {
1815 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1816 } else {
1817 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1818 RISCV::VFMV_F_S};
1819 // Cost of canonical NaN + branch
1820 // lui a0, 523264
1821 // fmv.w.x fa0, a0
1822 Type *DstTy = Ty->getScalarType();
1823 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1824 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1825 ExtraCost = 1 +
1826 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1828 getCFInstrCost(Instruction::Br, CostKind);
1829 }
1830 break;
1831
1832 case Intrinsic::minimum:
1833 if (FMF.noNaNs()) {
1834 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1835 } else {
1836 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1837 RISCV::VFMV_F_S};
1838 // Cost of canonical NaN + branch
1839 // lui a0, 523264
1840 // fmv.w.x fa0, a0
1841 Type *DstTy = Ty->getScalarType();
1842 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1843 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1844 ExtraCost = 1 +
1845 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1847 getCFInstrCost(Instruction::Br, CostKind);
1848 }
1849 break;
1850 }
1851 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1852 }
1853
1854 // An IR reduction is composed of one RVV reduction instruction and a vmv.
1855 unsigned SplitOp;
1857 switch (IID) {
1858 default:
1859 llvm_unreachable("Unsupported intrinsic");
1860 case Intrinsic::smax:
1861 SplitOp = RISCV::VMAX_VV;
1862 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1863 break;
1864 case Intrinsic::smin:
1865 SplitOp = RISCV::VMIN_VV;
1866 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1867 break;
1868 case Intrinsic::umax:
1869 SplitOp = RISCV::VMAXU_VV;
1870 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1871 break;
1872 case Intrinsic::umin:
1873 SplitOp = RISCV::VMINU_VV;
1874 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1875 break;
1876 case Intrinsic::maxnum:
1877 SplitOp = RISCV::VFMAX_VV;
1878 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1879 break;
1880 case Intrinsic::minnum:
1881 SplitOp = RISCV::VFMIN_VV;
1882 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1883 break;
1884 }
1885 // Add a cost for data larger than LMUL8
1886 InstructionCost SplitCost =
1887 (LT.first > 1) ? (LT.first - 1) *
1888 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1889 : 0;
1890 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1891}
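// The non-i1 min/max reductions above all reduce to the same shape:
// (LT.first - 1) splits paid at the cost of the element-wise SplitOp, plus one
// vred* + vmv sequence on the legal type. A compressed sketch of that formula
// with placeholder per-opcode costs (the real values come from
// getRISCVInstructionCost):
static unsigned minMaxReductionCost(unsigned NumParts /*LT.first*/,
                                    unsigned SplitOpCost,
                                    unsigned ReductionSeqCost) {
  unsigned SplitCost = NumParts > 1 ? (NumParts - 1) * SplitOpCost : 0;
  return SplitCost + ReductionSeqCost;
}
// e.g. data spanning two LMUL8 groups: minMaxReductionCost(2, 1, R) == 1 + R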
1892
1895 std::optional<FastMathFlags> FMF,
1897 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1898 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1899
1900 // Skip if scalar size of Ty is bigger than ELEN.
1901 if (Ty->getScalarSizeInBits() > ST->getELen())
1902 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1903
1904 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1905 assert(ISD && "Invalid opcode");
1906
1907 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1908 ISD != ISD::FADD)
1909 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1910
1911 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1912 Type *ElementTy = Ty->getElementType();
1913 if (ElementTy->isIntegerTy(1)) {
1914 // Example sequences:
1915 // vfirst.m a0, v0
1916 // seqz a0, a0
1917 if (LT.second == MVT::v1i1)
1918 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
1919 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1921
1922 if (ISD == ISD::AND) {
1923 // Example sequences:
1924 // vmand.mm v8, v9, v8 ; needed every time type is split
1925 // vmnot.m v8, v0 ; alias for vmnand
1926 // vcpop.m a0, v8
1927 // seqz a0, a0
1928
1929 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
1930 // For LMUL <= 8, there is no splitting,
1931 // the sequences are vmnot, vcpop and seqz.
1932 // When LMUL > 8 and split = 1,
1933 // the sequences are vmnand, vcpop and seqz.
1934 // When LMUL > 8 and split > 1,
1935 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
1936 return ((LT.first > 2) ? (LT.first - 2) : 0) *
1937 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
1938 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
1939 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1940 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1942 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
1943 // Example sequences:
1944 // vsetvli a0, zero, e8, mf8, ta, ma
1945 // vmxor.mm v8, v0, v8 ; needed every time type is split
1946 // vcpop.m a0, v8
1947 // andi a0, a0, 1
1948 return (LT.first - 1) *
1949 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1950 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
1951 } else {
1952 assert(ISD == ISD::OR);
1953 // Example sequences:
1954 // vsetvli a0, zero, e8, mf8, ta, ma
1955 // vmor.mm v8, v9, v8 ; needed every time type is split
1956 // vcpop.m a0, v0
1957 // snez a0, a0
1958 return (LT.first - 1) *
1959 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
1960 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1961 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1963 }
1964 }
1965
1966 // An IR reduction of or/and is composed of one vmv and one RVV reduction
1967 // instruction; the others are composed of two vmv and one RVV reduction
1968 // instruction.
1969 unsigned SplitOp;
1971 switch (ISD) {
1972 case ISD::ADD:
1973 SplitOp = RISCV::VADD_VV;
1974 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1975 break;
1976 case ISD::OR:
1977 SplitOp = RISCV::VOR_VV;
1978 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
1979 break;
1980 case ISD::XOR:
1981 SplitOp = RISCV::VXOR_VV;
1982 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1983 break;
1984 case ISD::AND:
1985 SplitOp = RISCV::VAND_VV;
1986 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
1987 break;
1988 case ISD::FADD:
1989 // We can't promote f16/bf16 fadd reductions.
1990 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
1991 LT.second.getScalarType() == MVT::bf16)
1992 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1994 Opcodes.push_back(RISCV::VFMV_S_F);
1995 for (unsigned i = 0; i < LT.first.getValue(); i++)
1996 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1997 Opcodes.push_back(RISCV::VFMV_F_S);
1998 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1999 }
2000 SplitOp = RISCV::VFADD_VV;
2001 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2002 break;
2003 }
2004 // Add a cost for data larger than LMUL8
2005 InstructionCost SplitCost =
2006 (LT.first > 1) ? (LT.first - 1) *
2007 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2008 : 0;
2009 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2010}
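// The i1 and-reduction cases documented above can be summarized in instruction
// counts alone (each instruction weighted as 1 here; the real code weights them
// via getRISCVInstructionCost and getCmpSelInstrCost):
static unsigned i1AndReductionInstrs(unsigned NumParts /*LT.first*/) {
  unsigned VMAnds = NumParts > 2 ? NumParts - 2 : 0; // extra vmand.mm per split
  return VMAnds + /*vmnand (or vmnot)*/ 1 + /*vcpop.m*/ 1 + /*seqz*/ 1;
}
// i1AndReductionInstrs(1) == 3, i1AndReductionInstrs(4) == 5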
2011
2013 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2014 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2015 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2016 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2017 FMF, CostKind);
2018
2019 // Skip if scalar size of ResTy is bigger than ELEN.
2020 if (ResTy->getScalarSizeInBits() > ST->getELen())
2021 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2022 FMF, CostKind);
2023
2024 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2025 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2026 FMF, CostKind);
2027
2028 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2029
2030 if (IsUnsigned && Opcode == Instruction::Add &&
2031 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2032 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2033 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
2034 return LT.first *
2035 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2036 }
2037
2038 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2039 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2040 FMF, CostKind);
2041
2042 return (LT.first - 1) +
2043 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2044}
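// The final return above models an extended (widening) add reduction as the
// non-extended reduction on the value type plus (LT.first - 1) extra
// accumulation steps. A one-line sketch of that composition;
// extendedAddReductionCost is an ad-hoc helper name:
static unsigned extendedAddReductionCost(unsigned NumParts /*LT.first*/,
                                         unsigned InnerReductionCost) {
  return (NumParts - 1) + InnerReductionCost;
}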
2045
2049 assert(OpInfo.isConstant() && "non constant operand?");
2050 if (!isa<VectorType>(Ty))
2051 // FIXME: We need to account for immediate materialization here, but doing
2052 // a decent job requires more knowledge about the immediate than we
2053 // currently have here.
2054 return 0;
2055
2056 if (OpInfo.isUniform())
2057 // vmv.v.i, vmv.v.x, or vfmv.v.f
2058 // We ignore the cost of the scalar constant materialization to be consistent
2059 // with how we treat scalar constants themselves just above.
2060 return 1;
2061
2062 return getConstantPoolLoadCost(Ty, CostKind);
2063}
2064
2066 Align Alignment,
2067 unsigned AddressSpace,
2069 TTI::OperandValueInfo OpInfo,
2070 const Instruction *I) const {
2071 EVT VT = TLI->getValueType(DL, Src, true);
2072 // Type legalization can't handle structs
2073 if (VT == MVT::Other)
2074 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2075 CostKind, OpInfo, I);
2076
2078 if (Opcode == Instruction::Store && OpInfo.isConstant())
2079 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2080
2081 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2082
2083 InstructionCost BaseCost = [&]() {
2084 InstructionCost Cost = LT.first;
2086 return Cost;
2087
2088 // Our actual lowering for the case where a wider legal type is available
2089 // uses a VL-predicated load on the wider type. This is reflected in
2090 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2091 // widened cases are scalarized.
2092 const DataLayout &DL = this->getDataLayout();
2093 if (Src->isVectorTy() && LT.second.isVector() &&
2095 LT.second.getSizeInBits()))
2096 return Cost;
2097
2098 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2099 CostKind, OpInfo, I);
2100 }();
2101
2102 // Assume memory op costs scale with the number of vector registers
2103 // possibly accessed by the instruction. Note that BasicTTI already
2104 // handles the LT.first term for us.
2105 if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
2106 BaseCost *= TLI->getLMULCost(LT.second);
2107 return Cost + BaseCost;
2108}
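// The memory-op cost above composes as: an optional constant-store
// materialization cost, plus a base cost that is scaled by LMUL for legal
// vector types (except for the code-size cost kind). A simplified sketch using
// an integer LMUL factor; the real value comes from TLI->getLMULCost and may be
// fractional:
static unsigned memoryOpCost(unsigned ConstStoreCost, unsigned BaseCost,
                             bool LegalVector, bool CodeSizeKind, unsigned LMUL) {
  if (LegalVector && !CodeSizeKind)
    BaseCost *= LMUL; // e.g. an LMUL=4 access touches four vector registers
  return ConstStoreCost + BaseCost;
}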
2109
2111 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2113 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2115 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2116 Op1Info, Op2Info, I);
2117
2118 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2119 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2120 Op1Info, Op2Info, I);
2121
2122 // Skip if scalar size of ValTy is bigger than ELEN.
2123 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2124 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2125 Op1Info, Op2Info, I);
2126
2127 auto GetConstantMatCost =
2128 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2129 if (OpInfo.isUniform())
2130 // We return 0 because we currently ignore the cost of materializing scalar
2131 // constants in GPRs.
2132 return 0;
2133
2134 return getConstantPoolLoadCost(ValTy, CostKind);
2135 };
2136
2137 InstructionCost ConstantMatCost;
2138 if (Op1Info.isConstant())
2139 ConstantMatCost += GetConstantMatCost(Op1Info);
2140 if (Op2Info.isConstant())
2141 ConstantMatCost += GetConstantMatCost(Op2Info);
2142
2143 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2144 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2145 if (CondTy->isVectorTy()) {
2146 if (ValTy->getScalarSizeInBits() == 1) {
2147 // vmandn.mm v8, v8, v9
2148 // vmand.mm v9, v0, v9
2149 // vmor.mm v0, v9, v8
2150 return ConstantMatCost +
2151 LT.first *
2152 getRISCVInstructionCost(
2153 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2154 LT.second, CostKind);
2155 }
2156 // vselect and max/min are supported natively.
2157 return ConstantMatCost +
2158 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2159 CostKind);
2160 }
2161
2162 if (ValTy->getScalarSizeInBits() == 1) {
2163 // vmv.v.x v9, a0
2164 // vmsne.vi v9, v9, 0
2165 // vmandn.mm v8, v8, v9
2166 // vmand.mm v9, v0, v9
2167 // vmor.mm v0, v9, v8
2168 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2169 return ConstantMatCost +
2170 LT.first *
2171 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2172 InterimVT, CostKind) +
2173 LT.first * getRISCVInstructionCost(
2174 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2175 LT.second, CostKind);
2176 }
2177
2178 // vmv.v.x v10, a0
2179 // vmsne.vi v0, v10, 0
2180 // vmerge.vvm v8, v9, v8, v0
2181 return ConstantMatCost +
2182 LT.first * getRISCVInstructionCost(
2183 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2184 LT.second, CostKind);
2185 }
2186
2187 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2188 CmpInst::isIntPredicate(VecPred)) {
2189 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2190 // provided they incur the same cost across all implementations
2191 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2192 LT.second,
2193 CostKind);
2194 }
2195
2196 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2197 CmpInst::isFPPredicate(VecPred)) {
2198
2199 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2200 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2201 return ConstantMatCost +
2202 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2203
2204 // If we do not support the input floating point vector type, use the base
2205 // one which will calculate as:
2206 // ScalarizeCost + Num * Cost for fixed vector,
2207 // InvalidCost for scalable vector.
2208 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2209 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2210 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2211 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2212 Op1Info, Op2Info, I);
2213
2214 // Assuming vector fp compare and mask instructions are all the same cost
2215 // until a need arises to differentiate them.
2216 switch (VecPred) {
2217 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2218 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2219 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2220 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2221 return ConstantMatCost +
2222 LT.first * getRISCVInstructionCost(
2223 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2224 LT.second, CostKind);
2225
2226 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2227 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2228 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2229 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2230 return ConstantMatCost +
2231 LT.first *
2232 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2233 LT.second, CostKind);
2234
2235 case CmpInst::FCMP_OEQ: // vmfeq.vv
2236 case CmpInst::FCMP_OGT: // vmflt.vv
2237 case CmpInst::FCMP_OGE: // vmfle.vv
2238 case CmpInst::FCMP_OLT: // vmflt.vv
2239 case CmpInst::FCMP_OLE: // vmfle.vv
2240 case CmpInst::FCMP_UNE: // vmfne.vv
2241 return ConstantMatCost +
2242 LT.first *
2243 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2244 default:
2245 break;
2246 }
2247 }
2248
2249 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2250 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2251 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2252 // be (0 + select instr cost).
2253 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2254 ValTy->isIntegerTy() && !I->user_empty()) {
2255 if (all_of(I->users(), [&](const User *U) {
2256 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2257 U->getType()->isIntegerTy() &&
2258 !isa<ConstantData>(U->getOperand(1)) &&
2259 !isa<ConstantData>(U->getOperand(2));
2260 }))
2261 return 0;
2262 }
2263
2264 // TODO: Add cost for scalar type.
2265
2266 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2267 Op1Info, Op2Info, I);
2268}
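// The vector-select cases above reduce to a short instruction sequence per
// legalized part. A condensed view with a unit cost per instruction (the real
// code weights each opcode via getRISCVInstructionCost):
static unsigned vectorSelectInstrs(bool VectorCond, bool I1Elements) {
  if (VectorCond)
    return I1Elements ? 3  // vmandn.mm + vmand.mm + vmor.mm
                      : 1; // vmerge.vvm
  return I1Elements ? 5    // vmv.v.x + vmsne.vi, then the three mask ops
                    : 3;   // vmv.v.x + vmsne.vi + vmerge.vvm
}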
2269
2272 const Instruction *I) const {
2274 return Opcode == Instruction::PHI ? 0 : 1;
2275 // Branches are assumed to be predicted.
2276 return 0;
2277}
2278
2281 unsigned Index,
2282 const Value *Op0,
2283 const Value *Op1) const {
2284 assert(Val->isVectorTy() && "This must be a vector type");
2285
2286 if (Opcode != Instruction::ExtractElement &&
2287 Opcode != Instruction::InsertElement)
2288 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2289
2290 // Legalize the type.
2291 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2292
2293 // This type is legalized to a scalar type.
2294 if (!LT.second.isVector()) {
2295 auto *FixedVecTy = cast<FixedVectorType>(Val);
2296 // If Index is a known constant, cost is zero.
2297 if (Index != -1U)
2298 return 0;
2299 // Extract/InsertElement with non-constant index is very costly when
2300 // scalarized; estimate cost of loads/stores sequence via the stack:
2301 // ExtractElement cost: store vector to stack, load scalar;
2302 // InsertElement cost: store vector to stack, store scalar, load vector.
2303 Type *ElemTy = FixedVecTy->getElementType();
2304 auto NumElems = FixedVecTy->getNumElements();
2305 auto Align = DL.getPrefTypeAlign(ElemTy);
2306 InstructionCost LoadCost =
2307 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2308 InstructionCost StoreCost =
2309 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2310 return Opcode == Instruction::ExtractElement
2311 ? StoreCost * NumElems + LoadCost
2312 : (StoreCost + LoadCost) * NumElems + StoreCost;
2313 }
2314
2315 // For unsupported scalable vector.
2316 if (LT.second.isScalableVector() && !LT.first.isValid())
2317 return LT.first;
2318
2319 // Mask vector extract/insert is expanded via e8.
2320 if (Val->getScalarSizeInBits() == 1) {
2321 VectorType *WideTy =
2323 cast<VectorType>(Val)->getElementCount());
2324 if (Opcode == Instruction::ExtractElement) {
2325 InstructionCost ExtendCost
2326 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2328 InstructionCost ExtractCost
2329 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2330 return ExtendCost + ExtractCost;
2331 }
2332 InstructionCost ExtendCost
2333 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2335 InstructionCost InsertCost
2336 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2337 InstructionCost TruncCost
2338 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2340 return ExtendCost + InsertCost + TruncCost;
2341 }
2342
2343
2344 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2345 // and vslideup + vmv.s.x to insert element to vector.
2346 unsigned BaseCost = 1;
2348 // For insertelement we also need to add 1 to the index as the input of vslideup.
2348 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2349
2350 if (Index != -1U) {
2351 // The type may be split. For fixed-width vectors we can normalize the
2352 // index to the new type.
2353 if (LT.second.isFixedLengthVector()) {
2354 unsigned Width = LT.second.getVectorNumElements();
2355 Index = Index % Width;
2356 }
2357
2358 // If exact VLEN is known, we will insert/extract into the appropriate
2359 // subvector with no additional subvector insert/extract cost.
2360 if (auto VLEN = ST->getRealVLen()) {
2361 unsigned EltSize = LT.second.getScalarSizeInBits();
2362 unsigned M1Max = *VLEN / EltSize;
2363 Index = Index % M1Max;
2364 }
2365
2366 if (Index == 0)
2367 // We can extract/insert the first element without vslidedown/vslideup.
2368 SlideCost = 0;
2369 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2370 Val->getScalarType()->isIntegerTy())
2371 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2372 else if (Opcode == Instruction::InsertElement)
2373 SlideCost = 1; // With a constant index, we do not need to use addi.
2374 }
2375
2376 // When the vector needs to be split into multiple register groups and the index
2377 // exceeds a single vector register group, we need to insert/extract the element
2378 // via the stack.
2379 if (LT.first > 1 &&
2380 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2381 LT.second.isScalableVector()))) {
2382 Type *ScalarType = Val->getScalarType();
2383 Align VecAlign = DL.getPrefTypeAlign(Val);
2384 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2385 // Extra addi for unknown index.
2386 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2387
2388 // Store all split vectors into stack and load the target element.
2389 if (Opcode == Instruction::ExtractElement)
2390 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2391 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2392 CostKind) +
2393 IdxCost;
2394
2395 // Store all split vectors into stack and store the target element and load
2396 // vectors back.
2397 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2398 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2399 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2400 CostKind) +
2401 IdxCost;
2402 }
2403
2404 // Extracting or inserting an i64 on a target with XLEN=32 needs more instructions.
2405 if (Val->getScalarType()->isIntegerTy() &&
2406 ST->getXLen() < Val->getScalarSizeInBits()) {
2407 // For extractelement, we need the following instructions:
2408 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2409 // vslidedown.vx v8, v8, a0
2410 // vmv.x.s a0, v8
2411 // li a1, 32
2412 // vsrl.vx v8, v8, a1
2413 // vmv.x.s a1, v8
2414
2415 // For insertelement, we need the following instructions:
2416 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2417 // vmv.v.i v12, 0
2418 // vslide1up.vx v16, v12, a1
2419 // vslide1up.vx v12, v16, a0
2420 // addi a0, a2, 1
2421 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2422 // vslideup.vx v8, v12, a2
2423
2424 // TODO: should we count these special vsetvlis?
2425 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2426 }
2427 return BaseCost + SlideCost;
2428}
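// Condensed view of the vslide cost decision above, for the common path where
// the element stays within a single register group (the split/stack and XLEN=32
// special cases are handled separately). slideCost is an ad-hoc helper name:
static unsigned slideCost(bool IsInsert, bool IndexKnown, unsigned Index,
                          bool HasRiVinsertVextract, bool IntScalar) {
  if (!IndexKnown)
    return IsInsert ? 2 : 1; // vslideup + addi for insert, vslidedown for extract
  if (Index == 0)
    return 0;                // first element needs no slide at all
  if (HasRiVinsertVextract && Index < 32 && IntScalar)
    return 0;                // ri.vinsert/ri.vextract replace the slide
  return 1;                  // constant index: one vslide, no addi
}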
2429
2433 unsigned Index) const {
2434 if (isa<FixedVectorType>(Val))
2436 Index);
2437
2438 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2439 // for the cost of extracting the last lane of a scalable vector. It probably
2440 // needs a more accurate cost.
2441 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2442 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2443 return getVectorInstrCost(Opcode, Val, CostKind,
2444 EC.getKnownMinValue() - 1 - Index, nullptr,
2445 nullptr);
2446}
2447
2449 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2451 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2452
2453 // TODO: Handle more cost kinds.
2455 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2456 Args, CxtI);
2457
2458 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2459 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2460 Args, CxtI);
2461
2462 // Skip if scalar size of Ty is bigger than ELEN.
2463 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2464 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2465 Args, CxtI);
2466
2467 // Legalize the type.
2468 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2469
2470 // TODO: Handle scalar type.
2471 if (!LT.second.isVector())
2472 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2473 Args, CxtI);
2474
2475 // f16 with zvfhmin and bf16 will be promoted to f32.
2476 // FIXME: nxv32[b]f16 will be custom lowered and split.
2477 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2478 InstructionCost CastCost = 0;
2479 if ((LT.second.getVectorElementType() == MVT::f16 ||
2480 LT.second.getVectorElementType() == MVT::bf16) &&
2481 TLI->getOperationAction(ISDOpcode, LT.second) ==
2483 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2484 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2485 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2486 // Add cost of extending arguments
2487 CastCost += LT.first * Args.size() *
2488 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2490 // Add cost of truncating result
2491 CastCost +=
2492 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2494 // Compute cost of op in promoted type
2495 LT.second = PromotedVT;
2496 }
2497
2498 auto getConstantMatCost =
2499 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2500 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2501 // Two sub-cases:
2502 // * Has a 5 bit immediate operand which can be splatted.
2503 // * Has a larger immediate which must be materialized in scalar register
2504 // We return 0 for both as we currently ignore the cost of materializing
2505 // scalar constants in GPRs.
2506 return 0;
2507
2508 return getConstantPoolLoadCost(Ty, CostKind);
2509 };
2510
2511 // Add the cost of materializing any constant vectors required.
2512 InstructionCost ConstantMatCost = 0;
2513 if (Op1Info.isConstant())
2514 ConstantMatCost += getConstantMatCost(0, Op1Info);
2515 if (Op2Info.isConstant())
2516 ConstantMatCost += getConstantMatCost(1, Op2Info);
2517
2518 unsigned Op;
2519 switch (ISDOpcode) {
2520 case ISD::ADD:
2521 case ISD::SUB:
2522 Op = RISCV::VADD_VV;
2523 break;
2524 case ISD::SHL:
2525 case ISD::SRL:
2526 case ISD::SRA:
2527 Op = RISCV::VSLL_VV;
2528 break;
2529 case ISD::AND:
2530 case ISD::OR:
2531 case ISD::XOR:
2532 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2533 break;
2534 case ISD::MUL:
2535 case ISD::MULHS:
2536 case ISD::MULHU:
2537 Op = RISCV::VMUL_VV;
2538 break;
2539 case ISD::SDIV:
2540 case ISD::UDIV:
2541 Op = RISCV::VDIV_VV;
2542 break;
2543 case ISD::SREM:
2544 case ISD::UREM:
2545 Op = RISCV::VREM_VV;
2546 break;
2547 case ISD::FADD:
2548 case ISD::FSUB:
2549 Op = RISCV::VFADD_VV;
2550 break;
2551 case ISD::FMUL:
2552 Op = RISCV::VFMUL_VV;
2553 break;
2554 case ISD::FDIV:
2555 Op = RISCV::VFDIV_VV;
2556 break;
2557 case ISD::FNEG:
2558 Op = RISCV::VFSGNJN_VV;
2559 break;
2560 default:
2561 // Assuming all other instructions have the same cost until a need arises to
2562 // differentiate them.
2563 return CastCost + ConstantMatCost +
2564 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2565 Args, CxtI);
2566 }
2567
2568 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2569 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2570 // ops are twice as expensive as integer ops. Do the same for vectors so
2571 // scalar floating point ops aren't cheaper than their vector equivalents.
2572 if (Ty->isFPOrFPVectorTy())
2573 InstrCost *= 2;
2574 return CastCost + ConstantMatCost + LT.first * InstrCost;
2575}
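// The arithmetic cost above is "one representative RVV instruction, scaled by
// the number of legalized parts, doubled for floating point", on top of any
// promotion casts and constant materialization. A compressed sketch of that
// final composition with placeholder inputs:
static unsigned arithmeticCost(unsigned RepresentativeOpCost, unsigned NumParts,
                               bool IsFloatingPoint, unsigned CastCost,
                               unsigned ConstantMatCost) {
  unsigned InstrCost = RepresentativeOpCost;
  if (IsFloatingPoint)
    InstrCost *= 2; // keep vector FP ops 2x integer, matching the scalar model
  return CastCost + ConstantMatCost + NumParts * InstrCost;
}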
2576
2577// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2579 ArrayRef<const Value *> Ptrs, const Value *Base,
2580 const TTI::PointersChainInfo &Info, Type *AccessTy,
2583 // In the basic model we take into account GEP instructions only
2584 // (although here can come alloca instruction, a value, constants and/or
2585 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2586 // pointer). Typically, if Base is not a GEP instruction and all the
2587 // pointers are relative to the same base address, all the rest are
2588 // either GEP instructions, PHIs, bitcasts or constants. When we have the same
2589 // base, we just count each non-Base GEP as an ADD operation if
2590 // any of its indices is non-constant.
2591 // If there are no known dependencies between the pointers, the cost is the
2592 // sum of the costs of the individual GEP instructions.
2593 for (auto [I, V] : enumerate(Ptrs)) {
2594 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2595 if (!GEP)
2596 continue;
2597 if (Info.isSameBase() && V != Base) {
2598 if (GEP->hasAllConstantIndices())
2599 continue;
2600 // If the chain is unit-stride and BaseReg + stride*i is a legal
2601 // addressing mode, then presume the base GEP is sitting around in a
2602 // register somewhere and check if we can fold the offset relative to
2603 // it.
2604 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2605 if (Info.isUnitStride() &&
2606 isLegalAddressingMode(AccessTy,
2607 /* BaseGV */ nullptr,
2608 /* BaseOffset */ Stride * I,
2609 /* HasBaseReg */ true,
2610 /* Scale */ 0,
2611 GEP->getType()->getPointerAddressSpace()))
2612 continue;
2613 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2614 {TTI::OK_AnyValue, TTI::OP_None},
2615 {TTI::OK_AnyValue, TTI::OP_None}, {});
2616 } else {
2617 SmallVector<const Value *> Indices(GEP->indices());
2618 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2619 Indices, AccessTy, CostKind);
2620 }
2621 }
2622 return Cost;
2623}
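// For a same-base, unit-stride chain the loop above charges nothing for GEPs
// whose byte offset (Stride * I) folds into the addressing mode, and one Add
// otherwise. A simplified model of just that part; isLegalOffset stands in for
// the target-dependent isLegalAddressingMode query:
static unsigned unitStrideChainCost(unsigned NumPtrs, unsigned StrideBytes,
                                    unsigned AddCost,
                                    bool (*isLegalOffset)(unsigned)) {
  unsigned Cost = 0;
  for (unsigned I = 1; I < NumPtrs; ++I) // pointer 0 is the base itself
    if (!isLegalOffset(StrideBytes * I))
      Cost += AddCost;
  return Cost;
}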
2624
2627 OptimizationRemarkEmitter *ORE) const {
2628 // TODO: More tuning on benchmarks and metrics with changes as needed
2629 // would apply to all settings below to enable performance.
2630
2631
2632 if (ST->enableDefaultUnroll())
2633 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2634
2635 // Enable Upper bound unrolling universally, not dependent upon the conditions
2636 // below.
2637 UP.UpperBound = true;
2638
2639 // Disable loop unrolling for Oz and Os.
2640 UP.OptSizeThreshold = 0;
2642 if (L->getHeader()->getParent()->hasOptSize())
2643 return;
2644
2645 SmallVector<BasicBlock *, 4> ExitingBlocks;
2646 L->getExitingBlocks(ExitingBlocks);
2647 LLVM_DEBUG(dbgs() << "Loop has:\n"
2648 << "Blocks: " << L->getNumBlocks() << "\n"
2649 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2650
2651 // Only allow another exit other than the latch. This acts as an early exit
2652 // as it mirrors the profitability calculation of the runtime unroller.
2653 if (ExitingBlocks.size() > 2)
2654 return;
2655
2656 // Limit the CFG of the loop body for targets with a branch predictor.
2657 // Allowing 4 blocks permits if-then-else diamonds in the body.
2658 if (L->getNumBlocks() > 4)
2659 return;
2660
2661 // Scan the loop: don't unroll loops with calls as this could prevent
2662 // inlining. Don't unroll auto-vectorized loops either, though do allow
2663 // unrolling of the scalar remainder.
2664 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2666 for (auto *BB : L->getBlocks()) {
2667 for (auto &I : *BB) {
2668 // Both auto-vectorized loops and the scalar remainder have the
2669 // isvectorized attribute, so differentiate between them by the presence
2670 // of vector instructions.
2671 if (IsVectorized && I.getType()->isVectorTy())
2672 return;
2673
2674 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2675 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2676 if (!isLoweredToCall(F))
2677 continue;
2678 }
2679 return;
2680 }
2681
2682 SmallVector<const Value *> Operands(I.operand_values());
2685 }
2686 }
2687
2688 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2689
2690 UP.Partial = true;
2691 UP.Runtime = true;
2692 UP.UnrollRemainder = true;
2693 UP.UnrollAndJam = true;
2694
2695 // Force-unrolling small loops can be very useful because of the taken-branch
2696 // cost of the backedge.
2697 if (Cost < 12)
2698 UP.Force = true;
2699}
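// The gating logic above can be compressed into a single predicate for "does
// this loop get the aggressive Force unroll". This is only a sketch of the
// decision flow; the real function mutates UnrollingPreferences rather than
// returning a bool, and Cost is the instruction cost summed in the scan loop:
static bool wouldForceUnroll(bool DefaultUnrollEnabled, bool HasOptSize,
                             unsigned NumExitingBlocks, unsigned NumBlocks,
                             bool HasCallsOrVectorCode, unsigned Cost) {
  if (DefaultUnrollEnabled || HasOptSize)
    return false; // generic path, or Os/Oz with unrolling disabled
  if (NumExitingBlocks > 2 || NumBlocks > 4 || HasCallsOrVectorCode)
    return false; // mirrors the early returns above
  return Cost < 12;
}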
2700
2702 TTI::PeelingPreferences &PP) const {
2704}
2705
2707 if (Ty->isVectorTy()) {
2708 // f16 with only zvfhmin and bf16 will be promoted to f32
2709 Type *EltTy = cast<VectorType>(Ty)->getElementType();
2710 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
2711 EltTy->isBFloatTy())
2713 cast<VectorType>(Ty));
2714
2716 if (Size.isScalable() && ST->hasVInstructions())
2717 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
2718
2720 return divideCeil(Size, ST->getRealMinVLen());
2721 }
2722
2723 return BaseT::getRegUsageForType(Ty);
2724}
2725
2726unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2727 if (SLPMaxVF.getNumOccurrences())
2728 return SLPMaxVF;
2729
2730 // Return how many elements can fit in getRegisterBitWidth. This is the
2731 // same routine as used in LoopVectorizer. We should probably be
2732 // accounting for whether we actually have instructions with the right
2733 // lane type, but we don't have enough information to do that without
2734 // some additional plumbing which hasn't been justified yet.
2735 TypeSize RegWidth =
2737 // If no vector registers, or absurd element widths, disable
2738 // vectorization by returning 1.
2739 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
2740}
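// getMaximumVF above simply asks how many ElemWidth-bit lanes fit in the
// register bit width, clamped to at least 1. A tiny sketch with a hypothetical
// 128-bit fixed register width:
#include <algorithm>

static unsigned maximumVF(unsigned RegWidthBits, unsigned ElemWidthBits) {
  return std::max(1u, RegWidthBits / ElemWidthBits);
}
// maximumVF(128, 32) == 4; absurd element widths collapse to 1 (no vectorization)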
2741
2743 return RVVMinTripCount;
2744}
2745
2747 return ST->enableUnalignedVectorMem();
2748}
2749
2752 ScalarEvolution *SE) const {
2753 if (ST->hasVendorXCVmem() && !ST->is64Bit())
2754 return TTI::AMK_PostIndexed;
2755
2757}
2758
2760 const TargetTransformInfo::LSRCost &C2) const {
2761 // The RISC-V-specific rule here is "instruction count has first priority".
2762 // If we need to emit adds inside the loop to add up base registers, then
2763 // we need at least one extra temporary register.
2764 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
2765 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
2766 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
2767 C1.NumIVMuls, C1.NumBaseAdds,
2768 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2769 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
2770 C2.NumIVMuls, C2.NumBaseAdds,
2771 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2772}
2773
2775 Align Alignment) const {
2776 auto *VTy = dyn_cast<VectorType>(DataTy);
2777 if (!VTy || VTy->isScalableTy())
2778 return false;
2779
2780 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2781 return false;
2782
2783 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
2784 // scalarize these types with LMUL >= maximum fixed-length LMUL.
2785 if (VTy->getElementType()->isIntegerTy(8))
2786 if (VTy->getElementCount().getFixedValue() > 256)
2787 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
2789 return true;
2790}
2791
2793 Align Alignment) const {
2794 auto *VTy = dyn_cast<VectorType>(DataTy);
2795 if (!VTy || VTy->isScalableTy())
2796 return false;
2797
2798 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2799 return false;
2800 return true;
2801}
2802
2803/// See if \p I should be considered for address type promotion. We check if \p
2804 /// I is a sext with the right type and used in memory accesses. If it is used in a
2805/// "complex" getelementptr, we allow it to be promoted without finding other
2806/// sext instructions that sign extended the same initial value. A getelementptr
2807/// is considered as "complex" if it has more than 2 operands.
2809 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
2810 bool Considerable = false;
2811 AllowPromotionWithoutCommonHeader = false;
2812 if (!isa<SExtInst>(&I))
2813 return false;
2814 Type *ConsideredSExtType =
2815 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2816 if (I.getType() != ConsideredSExtType)
2817 return false;
2818 // See if the sext is the one with the right type and used in at least one
2819 // GetElementPtrInst.
2820 for (const User *U : I.users()) {
2821 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2822 Considerable = true;
2823 // A getelementptr is considered as "complex" if it has more than 2
2824 // operands. We will promote a SExt used in such a complex GEP, as we
2825 // expect some computation to be merged if it is done on 64 bits.
2826 if (GEPInst->getNumOperands() > 2) {
2827 AllowPromotionWithoutCommonHeader = true;
2828 break;
2829 }
2830 }
2831 }
2832 return Considerable;
2833}
2834
2835bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
2836 switch (Opcode) {
2837 case Instruction::Add:
2838 case Instruction::Sub:
2839 case Instruction::Mul:
2840 case Instruction::And:
2841 case Instruction::Or:
2842 case Instruction::Xor:
2843 case Instruction::FAdd:
2844 case Instruction::FSub:
2845 case Instruction::FMul:
2846 case Instruction::FDiv:
2847 case Instruction::ICmp:
2848 case Instruction::FCmp:
2849 return true;
2850 case Instruction::Shl:
2851 case Instruction::LShr:
2852 case Instruction::AShr:
2853 case Instruction::UDiv:
2854 case Instruction::SDiv:
2855 case Instruction::URem:
2856 case Instruction::SRem:
2857 case Instruction::Select:
2858 return Operand == 1;
2859 default:
2860 return false;
2861 }
2862}
2863
2865 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2866 return false;
2867
2868 if (canSplatOperand(I->getOpcode(), Operand))
2869 return true;
2870
2871 auto *II = dyn_cast<IntrinsicInst>(I);
2872 if (!II)
2873 return false;
2874
2875 switch (II->getIntrinsicID()) {
2876 case Intrinsic::fma:
2877 case Intrinsic::vp_fma:
2878 case Intrinsic::fmuladd:
2879 case Intrinsic::vp_fmuladd:
2880 return Operand == 0 || Operand == 1;
2881 case Intrinsic::vp_shl:
2882 case Intrinsic::vp_lshr:
2883 case Intrinsic::vp_ashr:
2884 case Intrinsic::vp_udiv:
2885 case Intrinsic::vp_sdiv:
2886 case Intrinsic::vp_urem:
2887 case Intrinsic::vp_srem:
2888 case Intrinsic::ssub_sat:
2889 case Intrinsic::vp_ssub_sat:
2890 case Intrinsic::usub_sat:
2891 case Intrinsic::vp_usub_sat:
2892 case Intrinsic::vp_select:
2893 return Operand == 1;
2894 // These intrinsics are commutative.
2895 case Intrinsic::vp_add:
2896 case Intrinsic::vp_mul:
2897 case Intrinsic::vp_and:
2898 case Intrinsic::vp_or:
2899 case Intrinsic::vp_xor:
2900 case Intrinsic::vp_fadd:
2901 case Intrinsic::vp_fmul:
2902 case Intrinsic::vp_icmp:
2903 case Intrinsic::vp_fcmp:
2904 case Intrinsic::smin:
2905 case Intrinsic::vp_smin:
2906 case Intrinsic::umin:
2907 case Intrinsic::vp_umin:
2908 case Intrinsic::smax:
2909 case Intrinsic::vp_smax:
2910 case Intrinsic::umax:
2911 case Intrinsic::vp_umax:
2912 case Intrinsic::sadd_sat:
2913 case Intrinsic::vp_sadd_sat:
2914 case Intrinsic::uadd_sat:
2915 case Intrinsic::vp_uadd_sat:
2916 // These intrinsics have 'vr' versions.
2917 case Intrinsic::vp_sub:
2918 case Intrinsic::vp_fsub:
2919 case Intrinsic::vp_fdiv:
2920 return Operand == 0 || Operand == 1;
2921 default:
2922 return false;
2923 }
2924}
2925
2926/// Check if sinking \p I's operands to I's basic block is profitable, because
2927/// the operands can be folded into a target instruction, e.g.
2928/// splats of scalars can fold into vector instructions.
2930 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
2931 using namespace llvm::PatternMatch;
2932
2933 if (I->isBitwiseLogicOp()) {
2934 if (!I->getType()->isVectorTy()) {
2935 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
2936 for (auto &Op : I->operands()) {
2937 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
2938 if (match(Op.get(), m_Not(m_Value()))) {
2939 Ops.push_back(&Op);
2940 return true;
2941 }
2942 }
2943 }
2944 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
2945 for (auto &Op : I->operands()) {
2946 // (and X, (not Y)) -> (vandn.vv X, Y)
2947 if (match(Op.get(), m_Not(m_Value()))) {
2948 Ops.push_back(&Op);
2949 return true;
2950 }
2951 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
2953 m_ZeroInt()),
2954 m_Value(), m_ZeroMask()))) {
2955 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
2956 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
2957 Ops.push_back(&Not);
2958 Ops.push_back(&InsertElt);
2959 Ops.push_back(&Op);
2960 return true;
2961 }
2962 }
2963 }
2964 }
2965
2966 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2967 return false;
2968
2969 // Don't sink splat operands if the target prefers not to. Some targets require
2970 // S2V transfer buffers and we can run out of them copying the same value
2971 // repeatedly.
2972 // FIXME: It could still be worth doing if it would improve vector register
2973 // pressure and prevent a vector spill.
2974 if (!ST->sinkSplatOperands())
2975 return false;
2976
2977 for (auto OpIdx : enumerate(I->operands())) {
2978 if (!canSplatOperand(I, OpIdx.index()))
2979 continue;
2980
2981 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2982 // Make sure we are not already sinking this operand
2983 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2984 continue;
2985
2986 // We are looking for a splat/vp.splat that can be sunk.
2987 bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
2988 m_Value(), m_Value(), m_Value()));
2989 if (!IsVPSplat &&
2991 m_Undef(), m_ZeroMask())))
2992 continue;
2993
2994 // Don't sink i1 splats.
2995 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
2996 continue;
2997
2998 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2999 // and vector registers
3000 for (Use &U : Op->uses()) {
3001 Instruction *Insn = cast<Instruction>(U.getUser());
3002 if (!canSplatOperand(Insn, U.getOperandNo()))
3003 return false;
3004 }
3005
3006 // Sink any fpexts since they might be used in a widening fp pattern.
3007 if (IsVPSplat) {
3008 if (isa<FPExtInst>(Op->getOperand(0)))
3009 Ops.push_back(&Op->getOperandUse(0));
3010 } else {
3011 Use *InsertEltUse = &Op->getOperandUse(0);
3012 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3013 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3014 Ops.push_back(&InsertElt->getOperandUse(1));
3015 Ops.push_back(InsertEltUse);
3016 }
3017 Ops.push_back(&OpIdx.value());
3018 }
3019 return true;
3020}
3021
3023RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3025 // TODO: Enable expansion when unaligned access is not supported after we fix
3026 // issues in ExpandMemcmp.
3027 if (!ST->enableUnalignedScalarMem())
3028 return Options;
3029
3030 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3031 return Options;
3032
3033 Options.AllowOverlappingLoads = true;
3034 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3035 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3036 if (ST->is64Bit()) {
3037 Options.LoadSizes = {8, 4, 2, 1};
3038 Options.AllowedTailExpansions = {3, 5, 6};
3039 } else {
3040 Options.LoadSizes = {4, 2, 1};
3041 Options.AllowedTailExpansions = {3};
3042 }
3043
3044 if (IsZeroCmp && ST->hasVInstructions()) {
3045 unsigned VLenB = ST->getRealMinVLen() / 8;
3046 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3047 // `VLenB * MaxLMUL` so that it fits in a single register group.
3048 unsigned MinSize = ST->getXLen() / 8 + 1;
3049 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3050 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3051 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3052 }
3053 return Options;
3054}
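// Sketch of how the vector load sizes above are generated for the zero-compare
// case: every size from XLen/8 + 1 up to VLenB * MaxLMUL is prepended ahead of
// the scalar sizes. The concrete numbers in the comments assume XLEN=64,
// VLEN=128 and a maximum fixed-length LMUL of 8; zeroCmpLoadSizes is an ad-hoc
// helper name:
#include <vector>

static std::vector<unsigned> zeroCmpLoadSizes(unsigned XLen, unsigned VLenBits,
                                              unsigned MaxLMUL) {
  std::vector<unsigned> Sizes = {8, 4, 2, 1};  // rv64 scalar load sizes
  unsigned MinSize = XLen / 8 + 1;             // 9
  unsigned MaxSize = (VLenBits / 8) * MaxLMUL; // 16 * 8 = 128
  for (unsigned Size = MinSize; Size <= MaxSize; ++Size)
    Sizes.insert(Sizes.begin(), Size);         // largest size ends up first
  return Sizes;
}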
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:687
@ ICMP_EQ
equal
Definition: InstrTypes.h:699
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:694
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition: InstrTypes.h:691
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition: InstrTypes.h:680
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:688
bool isFPPredicate() const
Definition: InstrTypes.h:784
bool isIntPredicate() const
Definition: InstrTypes.h:785
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
Definition: DataLayout.h:481
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:842
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:674
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:468
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:846
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
bool noNaNs() const
Definition: FMF.h:65
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
Definition: DerivedTypes.h:627
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:949
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition: Operator.h:43
The optimization diagnostic interface.
unsigned getMaxLMULForFixedLengthVectors() const
bool hasVInstructionsF64() const
unsigned getRealMinVLen() const
bool useRVVForFixedLengthVectors() const
bool hasVInstructionsBF16Minimal() const
bool hasVInstructionsF16Minimal() const
unsigned getXLen() const
bool hasConditionalMoveFusion() const
bool hasVInstructionsF16() const
bool hasVInstructions() const
std::optional< unsigned > getRealVLen() const
bool hasOptimizedSegmentLoadStore(unsigned NF) const
unsigned getRealMaxVLen() const
bool hasVInstructionsF32() const
unsigned getELen() const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMinTripCountTailFoldingThreshold() const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind) const
Return the cost of materializing an immediate for a value operand of a store instruction.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
bool hasActiveVectorLength() const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *Src, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat operand.
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override
bool preferAlternateOpcodeVectorization() const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
std::optional< unsigned > getMaxVScale() const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
std::optional< unsigned > getVScaleForTuning() const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
static MVT getM1VT(MVT VT)
Given a vector (either fixed or scalable), return the scalable vector corresponding to a vector regis...
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
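A hedged sketch of the arithmetic this helper's name suggests, assuming VectorBits is the VLEN in bits, EltSize is the SEW, MinSize encodes the type's known-minimum width, and 64 bits per LMUL=1 block:
// Sketch only, not the LLVM implementation: VLMAX = (VLEN / SEW) * LMUL, with
// LMUL = MinSize / RVVBitsPerBlock, reassociated so fractional LMUL keeps
// precision in integer arithmetic.
unsigned computeVLMAXSketch(unsigned VectorBits, unsigned EltSize, unsigned MinSize) {
  const unsigned RVVBitsPerBlock = 64; // assumed block size per LMUL=1 register
  return (VectorBits / EltSize) * MinSize / RVVBitsPerBlock;
}
// e.g. VLEN=128, SEW=32, MinSize=256 (LMUL 4): (128/32)*256/64 = 16 elements.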
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
MVT getContainerForFixedLengthVector(MVT VT) const
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace, const DataLayout &) const
Returns whether or not generating an interleaved load/store intrinsic for this type will be legal.
static RISCVVType::VLMUL getLMUL(MVT VT)
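For orientation, a hedged back-of-the-envelope for the LMUL mapping these helpers deal in, assuming the usual 64 known-minimum bits per LMUL=1 register group (illustrative only, not RISCVTargetLowering's code):
#include <utility>
// A scalable type's known-minimum size divided by 64 gives its register-group
// size; sizes below 64 bits correspond to fractional LMUL. Assumes a nonzero,
// power-of-two size.
std::pair<unsigned, unsigned> lmulFraction(unsigned KnownMinBits) {
  const unsigned RVVBitsPerBlock = 64;
  if (KnownMinBits >= RVVBitsPerBlock)
    return {KnownMinBits / RVVBitsPerBlock, 1}; // e.g. nxv4i32: 128 bits -> LMUL 2
  return {1, RVVBitsPerBlock / KnownMinBits};   // e.g. nxv1i32: 32 bits  -> LMUL 1/2
}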
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:283
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
MVT getTypeToPromoteTo(unsigned Op, MVT VT) const
If the action for this operation is to promote, this method returns the ValueType to promote to.
virtual const DataLayout & getDataLayout() const
virtual TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
virtual bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:349
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
Value * getOperand(unsigned i) const
Definition: User.h:232
LLVM Value Representation.
Definition: Value.h:75
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:695
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:463
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of the scalar value passed in.
Definition: TypeSize.h:184
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:233
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:219
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:255
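Small usage examples for the fixed/scalable size helpers listed above (TypeSize::getFixed/getScalable and the FixedOrScalableQuantity queries); the function name is illustrative:
#include "llvm/Support/TypeSize.h"
void typeSizeExamples() {
  llvm::TypeSize F = llvm::TypeSize::getFixed(128);    // exactly 128 bits
  llvm::TypeSize S = llvm::TypeSize::getScalable(128); // 128 * vscale bits
  bool A = F.isKnownMultipleOf(64);                    // true: 128 is a multiple of 64
  auto Min = S.getKnownMinValue();                     // 128, the known minimum
  bool B = llvm::TypeSize::isKnownLE(llvm::TypeSize::getFixed(64), F); // provably true
  llvm::TypeSize Half = S.divideCoefficientBy(2);      // 64 * vscale bits
  (void)A; (void)Min; (void)B; (void)Half;
}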
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
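As a hedged illustration of why the returned count varies, here is the classic lui+addi split that makes many 32-bit constants a two-instruction materialization; the function name and the fixed 32-bit assumption are illustrative, and RISCVMatInt handles general 64-bit values with longer sequences:
#include <cstdint>
int64_t materialize32(int64_t Imm) {              // assumes Imm fits in 32 bits
  int64_t Lo12 = ((Imm & 0xFFF) ^ 0x800) - 0x800; // sign-extended low 12 bits (addi/addiw)
  int64_t Hi20 = (Imm - Lo12) >> 12;              // remaining upper bits (lui)
  return (Hi20 << 12) + Lo12;                     // == Imm, i.e. two instructions
}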
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:349
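A couple of assert-style sanity examples for the behaviour documented above (purely illustrative):
#include <cassert>
#include "llvm/Support/MathExtras.h"
void log2CeilExamples() {
  assert(llvm::Log2_32_Ceil(1) == 0);
  assert(llvm::Log2_32_Ceil(5) == 3);  // ceil(log2(5))
  assert(llvm::Log2_32_Ceil(0) == 32); // the documented zero case
}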
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
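A hedged usage sketch of the cost-table helpers above; the table contents and function name are made up for illustration:
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/Support/InstructionCost.h"
static const llvm::CostTblEntry ExampleTbl[] = {
    {llvm::ISD::ADD, llvm::MVT::v4i32, 1},   // hypothetical entries
    {llvm::ISD::MULHU, llvm::MVT::v4i32, 3},
};
llvm::InstructionCost lookupExample(int ISD, llvm::MVT VT) {
  if (const auto *Entry = llvm::CostTableLookup(ExampleTbl, ISD, VT))
    return Entry->Cost;
  return llvm::InstructionCost::getInvalid(); // no entry for this (ISD, VT) pair
}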
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1121
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:157
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition: MathExtras.h:282
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
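Quick, assert-style examples for several of the integer helpers listed above (countr_zero, isShiftedMask_64, Log2_32, isPowerOf2_32, divideCeil); the function name is illustrative:
#include <cassert>
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
void intHelperExamples() {
  assert(llvm::countr_zero(0x50u) == 4);   // 0b0101'0000 has four trailing zeros
  assert(llvm::isShiftedMask_64(0x0ff0));  // one contiguous run of ones
  assert(!llvm::isShiftedMask_64(0x0505)); // two runs, so not a shifted mask
  assert(llvm::Log2_32(64) == 6);
  assert(llvm::isPowerOf2_32(64) && !llvm::isPowerOf2_32(0));
  assert(llvm::divideCeil(7, 3) == 3);     // ceil(7 / 3)
}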
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
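A hedged sketch of the two shuffle-mask builders referenced above; the expected mask contents in the comments reflect my reading of these helpers, and the VF/factor values are arbitrary:
#include "llvm/Analysis/VectorUtils.h"
void shuffleMaskExamples() {
  // Every 2nd element starting at index 1 (a stride/deinterleave pattern):
  // expected {1, 3, 5, 7}.
  llvm::SmallVector<int, 16> Stride =
      llvm::createStrideMask(/*Start=*/1, /*Stride=*/2, /*VF=*/4);
  // Interleave two 4-element inputs: expected {0, 4, 1, 5, 2, 6, 3, 7}.
  llvm::SmallVector<int, 16> Ilv =
      llvm::createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
  (void)Stride; (void)Ilv;
}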
DWARFExpression::Operation Op
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1854
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
InstructionCost Cost
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2107
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:280
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
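To tie the unrolling fields above together, a hedged sketch of how a target hook typically fills them in; the helper name and the chosen values are illustrative, not RISC-V's actual policy:
#include "llvm/Analysis/TargetTransformInfo.h"
static void tuneUnrolling(llvm::TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Runtime = true;         // allow runtime unrolling
  UP.Partial = true;         // allow partial unrolling
  UP.UpperBound = true;      // may use the trip-count upper bound
  UP.UnrollAndJam = false;   // keep unroll-and-jam off
  UP.UnrollRemainder = false;
}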