RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
19#include <cmath>
20#include <optional>
21using namespace llvm;
22using namespace llvm::PatternMatch;
23
24#define DEBUG_TYPE "riscvtti"
25
27 "riscv-v-register-bit-width-lmul",
29 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
30 "by autovectorized code. Fractional LMULs are not supported."),
32
34 "riscv-v-slp-max-vf",
36 "Overrides result used for getMaximumVF query which is used "
37 "exclusively by SLP vectorizer."),
39
41 RVVMinTripCount("riscv-v-min-trip-count",
42 cl::desc("Set the lower bound of a trip count to decide on "
43 "vectorization while tail-folding."),
45
46InstructionCost
47RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
48 TTI::TargetCostKind CostKind) const {
49 // Check if the type is valid for all CostKind
50 if (!VT.isVector())
51 return InstructionCost::getInvalid();
52 size_t NumInstr = OpCodes.size();
53 if (CostKind == TTI::TCK_CodeSize)
54 return NumInstr;
55 InstructionCost LMULCost = TLI->getLMULCost(VT);
56 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
57 return LMULCost * NumInstr;
58 InstructionCost Cost = 0;
59 for (auto Op : OpCodes) {
60 switch (Op) {
61 case RISCV::VRGATHER_VI:
62 Cost += TLI->getVRGatherVICost(VT);
63 break;
64 case RISCV::VRGATHER_VV:
65 Cost += TLI->getVRGatherVVCost(VT);
66 break;
67 case RISCV::VSLIDEUP_VI:
68 case RISCV::VSLIDEDOWN_VI:
69 Cost += TLI->getVSlideVICost(VT);
70 break;
71 case RISCV::VSLIDEUP_VX:
72 case RISCV::VSLIDEDOWN_VX:
73 Cost += TLI->getVSlideVXCost(VT);
74 break;
75 case RISCV::VREDMAX_VS:
76 case RISCV::VREDMIN_VS:
77 case RISCV::VREDMAXU_VS:
78 case RISCV::VREDMINU_VS:
79 case RISCV::VREDSUM_VS:
80 case RISCV::VREDAND_VS:
81 case RISCV::VREDOR_VS:
82 case RISCV::VREDXOR_VS:
83 case RISCV::VFREDMAX_VS:
84 case RISCV::VFREDMIN_VS:
85 case RISCV::VFREDUSUM_VS: {
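// Unordered reductions are modeled as roughly log2(VL) steps of work,
// reflecting a tree-style combine of the elements.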
86 unsigned VL = VT.getVectorMinNumElements();
87 if (!VT.isFixedLengthVector())
88 VL *= *getVScaleForTuning();
89 Cost += Log2_32_Ceil(VL);
90 break;
91 }
92 case RISCV::VFREDOSUM_VS: {
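// The ordered FP reduction must visit the elements serially, so it is
// modeled as VL steps of work.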
93 unsigned VL = VT.getVectorMinNumElements();
94 if (!VT.isFixedLengthVector())
95 VL *= *getVScaleForTuning();
96 Cost += VL;
97 break;
98 }
99 case RISCV::VMV_X_S:
100 case RISCV::VMV_S_X:
101 case RISCV::VFMV_F_S:
102 case RISCV::VFMV_S_F:
103 case RISCV::VMOR_MM:
104 case RISCV::VMXOR_MM:
105 case RISCV::VMAND_MM:
106 case RISCV::VMANDN_MM:
107 case RISCV::VMNAND_MM:
108 case RISCV::VCPOP_M:
109 case RISCV::VFIRST_M:
110 Cost += 1;
111 break;
112 default:
113 Cost += LMULCost;
114 }
115 }
116 return Cost;
117}
118
119static InstructionCost getIntImmCostImpl(const DataLayout &DL,
120 const RISCVSubtarget *ST,
121 const APInt &Imm, Type *Ty,
122 TTI::TargetCostKind CostKind,
123 bool FreeZeroes) {
124 assert(Ty->isIntegerTy() &&
125 "getIntImmCost can only estimate cost of materialising integers");
126
127 // We have a Zero register, so 0 is always free.
128 if (Imm == 0)
129 return TTI::TCC_Free;
130
131 // Otherwise, we check how many instructions it will take to materialise.
132 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
133 /*CompressionCost=*/false, FreeZeroes);
134}
135
136InstructionCost
137RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
138 TTI::TargetCostKind CostKind) const {
139 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
140}
141
142// Look for patterns of shift followed by AND that can be turned into a pair of
143// shifts. We won't need to materialize an immediate for the AND so these can
144// be considered free.
145static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
146 uint64_t Mask = Imm.getZExtValue();
147 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
148 if (!BO || !BO->hasOneUse())
149 return false;
150
151 if (BO->getOpcode() != Instruction::Shl)
152 return false;
153
154 if (!isa<ConstantInt>(BO->getOperand(1)))
155 return false;
156
157 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
158 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
159 // is a mask shifted by c2 bits with c3 leading zeros.
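// For example, with c2 = 11 and c1 = 0x7ff800 (a 12-bit mask with 41 leading
// zeros on RV64), (and (shl x, 11), 0x7ff800) becomes (srli (slli x, 52), 41).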
160 if (isShiftedMask_64(Mask)) {
161 unsigned Trailing = llvm::countr_zero(Mask);
162 if (ShAmt == Trailing)
163 return true;
164 }
165
166 return false;
167}
168
169InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
170 const APInt &Imm, Type *Ty,
171 TTI::TargetCostKind CostKind,
172 Instruction *Inst) const {
173 assert(Ty->isIntegerTy() &&
174 "getIntImmCost can only estimate cost of materialising integers");
175
176 // We have a Zero register, so 0 is always free.
177 if (Imm == 0)
178 return TTI::TCC_Free;
179
180 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
181 // commutative, in others the immediate comes from a specific argument index.
182 bool Takes12BitImm = false;
183 unsigned ImmArgIdx = ~0U;
184
185 switch (Opcode) {
186 case Instruction::GetElementPtr:
187 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
188 // split up large offsets in GEP into better parts than ConstantHoisting
189 // can.
190 return TTI::TCC_Free;
191 case Instruction::Store: {
192 // Use the materialization cost regardless of whether it's the address or
193 // the value that is constant, except when the store is misaligned and
194 // misaligned accesses are not legal (experience shows constant hoisting
195 // can sometimes be harmful in such cases).
196 if (Idx == 1 || !Inst)
197 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
198 /*FreeZeroes=*/true);
199
200 StoreInst *ST = cast<StoreInst>(Inst);
201 if (!getTLI()->allowsMemoryAccessForAlignment(
202 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
203 ST->getPointerAddressSpace(), ST->getAlign()))
204 return TTI::TCC_Free;
205
206 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
207 /*FreeZeroes=*/true);
208 }
209 case Instruction::Load:
210 // If the address is a constant, use the materialization cost.
211 return getIntImmCost(Imm, Ty, CostKind);
212 case Instruction::And:
213 // zext.h
214 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
215 return TTI::TCC_Free;
216 // zext.w
217 if (Imm == UINT64_C(0xffffffff) &&
218 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
219 return TTI::TCC_Free;
220 // bclri
221 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
222 return TTI::TCC_Free;
223 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
224 canUseShiftPair(Inst, Imm))
225 return TTI::TCC_Free;
226 Takes12BitImm = true;
227 break;
228 case Instruction::Add:
229 Takes12BitImm = true;
230 break;
231 case Instruction::Or:
232 case Instruction::Xor:
233 // bseti/binvi
234 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
235 return TTI::TCC_Free;
236 Takes12BitImm = true;
237 break;
238 case Instruction::Mul:
239 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
240 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
241 return TTI::TCC_Free;
242 // One more or less than a power of 2 can use SLLI+ADD/SUB.
243 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
244 return TTI::TCC_Free;
245 // FIXME: There is no MULI instruction.
246 Takes12BitImm = true;
247 break;
248 case Instruction::Sub:
249 case Instruction::Shl:
250 case Instruction::LShr:
251 case Instruction::AShr:
252 Takes12BitImm = true;
253 ImmArgIdx = 1;
254 break;
255 default:
256 break;
257 }
258
259 if (Takes12BitImm) {
260 // Check immediate is the correct argument...
261 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
262 // ... and fits into the 12-bit immediate.
263 if (Imm.getSignificantBits() <= 64 &&
264 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
265 return TTI::TCC_Free;
266 }
267 }
268
269 // Otherwise, use the full materialisation cost.
270 return getIntImmCost(Imm, Ty, CostKind);
271 }
272
273 // By default, prevent hoisting.
274 return TTI::TCC_Free;
275}
276
277InstructionCost
278RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
279 const APInt &Imm, Type *Ty,
280 TTI::TargetCostKind CostKind) const {
281 // Prevent hoisting in unknown cases.
282 return TTI::TCC_Free;
283}
284
286 return ST->hasVInstructions();
287}
288
289TargetTransformInfo::PopcntSupportKind
290RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
291 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
292 return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
293 ? TTI::PSK_FastHardware
294 : TTI::PSK_Software;
295}
296
297InstructionCost RISCVTTIImpl::getPartialReductionCost(
298 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
299 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
300 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
301 TTI::TargetCostKind CostKind) const {
302
303 // zve32x is broken for partial_reduce_umla, but let's make sure we
304 // don't generate them.
305 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
306 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
307 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
308 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
309 return InstructionCost::getInvalid();
310
311 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
312 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
313 // Note: Assuming all vqdot* variants are equal cost
314 return LT.first *
315 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
316}
317
319 // Currently, the ExpandReductions pass can't expand scalable-vector
320 // reductions, but we still request expansion as RVV doesn't support certain
321 // reductions and the SelectionDAG can't legalize them either.
322 switch (II->getIntrinsicID()) {
323 default:
324 return false;
325 // These reductions have no equivalent in RVV
326 case Intrinsic::vector_reduce_mul:
327 case Intrinsic::vector_reduce_fmul:
328 return true;
329 }
330}
331
332std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
333 if (ST->hasVInstructions())
334 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
335 return BaseT::getMaxVScale();
336}
337
338std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
339 if (ST->hasVInstructions())
340 if (unsigned MinVLen = ST->getRealMinVLen();
341 MinVLen >= RISCV::RVVBitsPerBlock)
342 return MinVLen / RISCV::RVVBitsPerBlock;
343 return BaseT::getVScaleForTuning();
344}
345
346TypeSize
347RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
348 unsigned LMUL =
349 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
350 switch (K) {
351 case TargetTransformInfo::RGK_Scalar:
352 return TypeSize::getFixed(ST->getXLen());
353 case TargetTransformInfo::RGK_FixedWidthVector:
354 return TypeSize::getFixed(
355 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
356 case TargetTransformInfo::RGK_ScalableVector:
357 return TypeSize::getScalable(
358 (ST->hasVInstructions() &&
359 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
360 ? LMUL * RISCV::RVVBitsPerBlock
361 : 0);
362 }
363
364 llvm_unreachable("Unsupported register kind");
365}
366
367InstructionCost
368RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
369 TTI::TargetCostKind CostKind) const {
370 // Add a cost of address generation + the cost of the load. The address
371 // is expected to be a PC relative offset to a constant pool entry
372 // using auipc/addi.
373 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
374 /*AddressSpace=*/0, CostKind);
375}
376
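// Returns true if Mask simply repeats its leading SubVectorSize elements,
// e.g. <0, 1, 0, 1, 0, 1, 0, 1> is the first two elements concatenated four
// times (SubVectorSize == 2).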
377static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
378 unsigned Size = Mask.size();
379 if (!isPowerOf2_32(Size))
380 return false;
381 for (unsigned I = 0; I != Size; ++I) {
382 if (static_cast<unsigned>(Mask[I]) == I)
383 continue;
384 if (Mask[I] != 0)
385 return false;
386 if (Size % I != 0)
387 return false;
388 for (unsigned J = I + 1; J != Size; ++J)
389 // Check the pattern is repeated.
390 if (static_cast<unsigned>(Mask[J]) != J % I)
391 return false;
392 SubVectorSize = I;
393 return true;
394 }
395 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
396 return false;
397}
398
399static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
400 LLVMContext &C) {
401 assert((DataVT.getScalarSizeInBits() != 8 ||
402 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
403 MVT IndexVT = DataVT.changeTypeToInteger();
404 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
405 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
406 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
407}
408
409/// Attempt to approximate the cost of a shuffle which will require splitting
410/// during legalization. Note that processShuffleMasks is not an exact proxy
411/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
412 /// reasonably close upper bound.
413static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI,
414 MVT LegalVT, VectorType *Tp,
415 ArrayRef<int> Mask,
416 TTI::TargetCostKind CostKind) {
417 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
418 "Expected fixed vector type and non-empty mask");
419 unsigned LegalNumElts = LegalVT.getVectorNumElements();
420 // Number of destination vectors after legalization:
421 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
422 // We are going to permute multiple sources and the result will be in
423 // multiple destinations. Providing an accurate cost only for splits where
424 // the element type remains the same.
425 if (NumOfDests <= 1 ||
426 LegalVT.getVectorElementType().getSizeInBits() !=
427 Tp->getElementType()->getPrimitiveSizeInBits() ||
428 LegalNumElts >= Tp->getElementCount().getFixedValue())
429 return InstructionCost::getInvalid();
430
431 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
432 unsigned LegalVTSize = LegalVT.getStoreSize();
433 // Number of source vectors after legalization:
434 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
435
436 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
437
438 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
439 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
440 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
441 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
442 assert(NormalizedVF >= Mask.size() &&
443 "Normalized mask expected to be not shorter than original mask.");
444 copy(Mask, NormalizedMask.begin());
446 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
448 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
449 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
450 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
451 return;
452 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
453 .second)
454 return;
457 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
458 SingleOpTy, RegMask, CostKind, 0, nullptr);
459 },
460 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
463 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
464 SingleOpTy, RegMask, CostKind, 0, nullptr);
465 });
466 return Cost;
467}
468
469/// Try to perform better estimation of the permutation.
470/// 1. Split the source/destination vectors into real registers.
471/// 2. Do the mask analysis to identify which real registers are
472/// permuted. If more than 1 source registers are used for the
473/// destination register building, the cost for this destination register
474/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
475/// source register is used, build mask and calculate the cost as a cost
476/// of PermuteSingleSrc.
477/// Also, for the single register permute we try to identify if the
478/// destination register is just a copy of the source register or the
479/// copy of the previous destination register (the cost is
480/// TTI::TCC_Basic). If the source register is just reused, the cost for
481/// this operation is 0.
482static InstructionCost
483costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT,
484 std::optional<unsigned> VLen, VectorType *Tp,
485 ArrayRef<int> Mask, TTI::TargetCostKind CostKind) {
486 assert(LegalVT.isFixedLengthVector());
487 if (!VLen || Mask.empty())
488 return InstructionCost::getInvalid();
489 MVT ElemVT = LegalVT.getVectorElementType();
490 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
491 LegalVT = TTI.getTypeLegalizationCost(
492 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
493 .second;
494 // Number of destination vectors after legalization:
495 InstructionCost NumOfDests =
496 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
497 if (NumOfDests <= 1 ||
498 LegalVT.getVectorElementType().getSizeInBits() !=
499 Tp->getElementType()->getPrimitiveSizeInBits() ||
500 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
501 return InstructionCost::getInvalid();
502
503 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
504 unsigned LegalVTSize = LegalVT.getStoreSize();
505 // Number of source vectors after legalization:
506 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
507
508 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
509 LegalVT.getVectorNumElements());
510
511 unsigned E = NumOfDests.getValue();
512 unsigned NormalizedVF =
513 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
514 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
515 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
516 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
517 assert(NormalizedVF >= Mask.size() &&
518 "Normalized mask expected to be not shorter than original mask.");
519 copy(Mask, NormalizedMask.begin());
521 int NumShuffles = 0;
522 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
524 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
525 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
526 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
527 return;
528 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
529 .second)
530 return;
531 ++NumShuffles;
533 SingleOpTy, RegMask, CostKind, 0, nullptr);
534 },
535 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
537 SingleOpTy, RegMask, CostKind, 0, nullptr);
538 NumShuffles += 2;
539 });
540 // Note: check that we do not emit too many shuffles here to prevent code
541 // size explosion.
542 // TODO: investigate, if it can be improved by extra analysis of the masks
543 // to check if the code is more profitable.
544 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
545 (NumOfDestRegs <= 2 && NumShuffles < 4))
546 return Cost;
547 return InstructionCost::getInvalid();
548}
549
550InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
551 ArrayRef<int> Mask,
553 // Avoid missing masks and length changing shuffles
554 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
556
557 int NumElts = Tp->getNumElements();
558 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
559 // Avoid scalarization cases
560 if (!LT.second.isFixedLengthVector())
562
563 // Requires moving elements between parts, which requires additional
564 // unmodeled instructions.
565 if (LT.first != 1)
567
568 auto GetSlideOpcode = [&](int SlideAmt) {
569 assert(SlideAmt != 0);
570 bool IsVI = isUInt<5>(std::abs(SlideAmt));
571 if (SlideAmt < 0)
572 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
573 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
574 };
575
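// A shuffle that decomposes into a masked slide pair costs at most two
// slides (or one slide plus a vmerge) plus a constant pool load for the
// selection mask.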
576 std::array<std::pair<int, int>, 2> SrcInfo;
577 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
579
580 if (SrcInfo[1].second == 0)
581 std::swap(SrcInfo[0], SrcInfo[1]);
582
583 InstructionCost FirstSlideCost = 0;
584 if (SrcInfo[0].second != 0) {
585 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
586 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
587 }
588
589 if (SrcInfo[1].first == -1)
590 return FirstSlideCost;
591
592 InstructionCost SecondSlideCost = 0;
593 if (SrcInfo[1].second != 0) {
594 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
595 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
596 } else {
597 SecondSlideCost =
598 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
599 }
600
601 auto EC = Tp->getElementCount();
602 VectorType *MaskTy =
604 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
605 return FirstSlideCost + SecondSlideCost + MaskCost;
606}
607
610 VectorType *SrcTy, ArrayRef<int> Mask,
611 TTI::TargetCostKind CostKind, int Index,
613 const Instruction *CxtI) const {
614 assert((Mask.empty() || DstTy->isScalableTy() ||
615 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
616 "Expected the Mask to match the return size if given");
617 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
618 "Expected the same scalar types");
619
620 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
621 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
622
623 // First, handle cases where having a fixed length vector enables us to
624 // give a more accurate cost than falling back to generic scalable codegen.
625 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
626 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
627 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
628 InstructionCost VRegSplittingCost = costShuffleViaVRegSplitting(
629 *this, LT.second, ST->getRealVLen(),
630 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
631 if (VRegSplittingCost.isValid())
632 return VRegSplittingCost;
633 switch (Kind) {
634 default:
635 break;
636 case TTI::SK_PermuteSingleSrc: {
637 if (Mask.size() >= 2) {
638 MVT EltTp = LT.second.getVectorElementType();
639 // If the size of the element is < ELEN then shuffles of interleaves and
640 // deinterleaves of 2 vectors can be lowered into the following
641 // sequences
642 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
643 // Example sequence:
644 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
645 // vwaddu.vv v10, v8, v9
646 // li a0, -1 (ignored)
647 // vwmaccu.vx v10, a0, v9
648 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
649 return 2 * LT.first * TLI->getLMULCost(LT.second);
650
651 if (Mask[0] == 0 || Mask[0] == 1) {
652 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
653 // Example sequence:
654 // vnsrl.wi v10, v8, 0
655 if (equal(DeinterleaveMask, Mask))
656 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
657 LT.second, CostKind);
658 }
659 }
660 int SubVectorSize;
661 if (LT.second.getScalarSizeInBits() != 1 &&
662 isRepeatedConcatMask(Mask, SubVectorSize)) {
664 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
665 // The cost of extraction from a subvector is 0 if the index is 0.
666 for (unsigned I = 0; I != NumSlides; ++I) {
667 unsigned InsertIndex = SubVectorSize * (1 << I);
668 FixedVectorType *SubTp =
669 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
670 FixedVectorType *DestTp =
672 std::pair<InstructionCost, MVT> DestLT =
674 // Add the cost of whole vector register move because the
675 // destination vector register group for vslideup cannot overlap the
676 // source.
677 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
678 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
679 CostKind, InsertIndex, SubTp);
680 }
681 return Cost;
682 }
683 }
684
685 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
686 SlideCost.isValid())
687 return SlideCost;
688
689 // vrgather + cost of generating the mask constant.
690 // We model this for an unknown mask with a single vrgather.
691 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
692 LT.second.getVectorNumElements() <= 256)) {
693 VectorType *IdxTy =
694 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
695 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
696 return IndexCost +
697 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
698 }
699 break;
700 }
701 case TTI::SK_Transpose:
702 case TTI::SK_PermuteTwoSrc: {
703
704 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
705 SlideCost.isValid())
706 return SlideCost;
707
708 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
709 // register for the second vrgather. We model this for an unknown
710 // (shuffle) mask.
711 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
712 LT.second.getVectorNumElements() <= 256)) {
713 auto &C = SrcTy->getContext();
714 auto EC = SrcTy->getElementCount();
715 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
717 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
718 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
719 return 2 * IndexCost +
720 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
721 LT.second, CostKind) +
722 MaskCost;
723 }
724 break;
725 }
726 }
727
728 auto shouldSplit = [](TTI::ShuffleKind Kind) {
729 switch (Kind) {
730 default:
731 return false;
735 return true;
736 }
737 };
738
739 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
740 shouldSplit(Kind)) {
741 InstructionCost SplitCost =
742 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
743 if (SplitCost.isValid())
744 return SplitCost;
745 }
746 }
747
748 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
749 switch (Kind) {
750 default:
751 // Fallthrough to generic handling.
752 // TODO: Most of these cases will return getInvalid in generic code, and
753 // must be implemented here.
754 break;
755 case TTI::SK_ExtractSubvector:
756 // Extract at zero is always a subregister extract
757 if (Index == 0)
758 return TTI::TCC_Free;
759
760 // If we're extracting a subvector of at most m1 size at a sub-register
761 // boundary - which unfortunately we need exact vlen to identify - this is
762 // a subregister extract at worst and thus won't require a vslidedown.
763 // TODO: Extend for aligned m2, m4 subvector extracts
764 // TODO: Extend for misaligned (but contained) extracts
765 // TODO: Extend for scalable subvector types
766 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
767 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
768 if (std::optional<unsigned> VLen = ST->getRealVLen();
769 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
770 SubLT.second.getSizeInBits() <= *VLen)
771 return TTI::TCC_Free;
772 }
773
774 // Example sequence:
775 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
776 // vslidedown.vi v8, v9, 2
777 return LT.first *
778 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
779 case TTI::SK_InsertSubvector:
780 // Example sequence:
781 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
782 // vslideup.vi v8, v9, 2
783 LT = getTypeLegalizationCost(DstTy);
784 return LT.first *
785 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
786 case TTI::SK_Select: {
787 // Example sequence:
788 // li a0, 90
789 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
790 // vmv.s.x v0, a0
791 // vmerge.vvm v8, v9, v8, v0
792 // We use 2 for the cost of the mask materialization as this is the true
793 // cost for small masks and most shuffles are small. At worst, this cost
794 // should be a very small constant for the constant pool load. As such,
795 // we may bias towards large selects slightly more than truly warranted.
796 return LT.first *
797 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
798 LT.second, CostKind));
799 }
800 case TTI::SK_Broadcast: {
801 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
802 Instruction::InsertElement);
803 if (LT.second.getScalarSizeInBits() == 1) {
804 if (HasScalar) {
805 // Example sequence:
806 // andi a0, a0, 1
807 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
808 // vmv.v.x v8, a0
809 // vmsne.vi v0, v8, 0
810 return LT.first *
811 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
812 LT.second, CostKind));
813 }
814 // Example sequence:
815 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
816 // vmv.v.i v8, 0
817 // vmerge.vim v8, v8, 1, v0
818 // vmv.x.s a0, v8
819 // andi a0, a0, 1
820 // vmv.v.x v8, a0
821 // vmsne.vi v0, v8, 0
822
823 return LT.first *
824 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
825 RISCV::VMV_X_S, RISCV::VMV_V_X,
826 RISCV::VMSNE_VI},
827 LT.second, CostKind));
828 }
829
830 if (HasScalar) {
831 // Example sequence:
832 // vmv.v.x v8, a0
833 return LT.first *
834 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
835 }
836
837 // Example sequence:
838 // vrgather.vi v9, v8, 0
839 return LT.first *
840 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
841 }
842 case TTI::SK_Splice: {
843 // vslidedown+vslideup.
844 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
845 // of similar code, but I think we expand through memory.
846 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
847 if (Index >= 0 && Index < 32)
848 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
849 else if (Index < 0 && Index > -32)
850 Opcodes[1] = RISCV::VSLIDEUP_VI;
851 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
852 }
853 case TTI::SK_Reverse: {
854
855 if (!LT.second.isVector())
856 return InstructionCost::getInvalid();
857
858 // TODO: Cases to improve here:
859 // * Illegal vector types
860 // * i64 on RV32
861 if (SrcTy->getElementType()->isIntegerTy(1)) {
862 VectorType *WideTy =
864 cast<VectorType>(SrcTy)->getElementCount());
865 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
867 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
868 nullptr) +
869 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
871 }
872
873 MVT ContainerVT = LT.second;
874 if (LT.second.isFixedLengthVector())
875 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
876 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
877 if (ContainerVT.bitsLE(M1VT)) {
878 // Example sequence:
879 // csrr a0, vlenb
880 // srli a0, a0, 3
881 // addi a0, a0, -1
882 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
883 // vid.v v9
884 // vrsub.vx v10, v9, a0
885 // vrgather.vv v9, v8, v10
886 InstructionCost LenCost = 3;
887 if (LT.second.isFixedLengthVector())
888 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
889 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
890 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
891 if (LT.second.isFixedLengthVector() &&
892 isInt<5>(LT.second.getVectorNumElements() - 1))
893 Opcodes[1] = RISCV::VRSUB_VI;
894 InstructionCost GatherCost =
895 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
896 return LT.first * (LenCost + GatherCost);
897 }
898
899 // At high LMUL, we split into a series of M1 reverses (see
900 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
901 // the resulting gap at the bottom (for fixed vectors only). The important
902 // bit is that the cost scales linearly, not quadratically with LMUL.
903 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
904 InstructionCost FixedCost =
905 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
906 unsigned Ratio =
908 InstructionCost GatherCost =
909 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
910 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
911 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
912 return FixedCost + LT.first * (GatherCost + SlideCost);
913 }
914 }
915 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
916 SubTp);
917}
918
919static unsigned isM1OrSmaller(MVT VT) {
920 RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
921 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
922 LMUL == RISCVVType::VLMUL::LMUL_F4 ||
923 LMUL == RISCVVType::VLMUL::LMUL_F2 ||
924 LMUL == RISCVVType::VLMUL::LMUL_1);
925}
926
928 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
929 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
930 ArrayRef<Value *> VL) const {
931 if (isa<ScalableVectorType>(Ty))
932 return InstructionCost::getInvalid();
933
934 // A build_vector (which is m1 sized or smaller) can be done in no
935 // worse than one vslide1down.vx per element in the type. We could
936 // in theory do an explode_vector in the inverse manner, but our
937 // lowering today does not have a first class node for this pattern.
938 InstructionCost Cost = BaseT::getScalarizationOverhead(
939 Ty, DemandedElts, Insert, Extract, CostKind);
940 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
941 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
942 if (Ty->getScalarSizeInBits() == 1) {
943 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
944 // Note: Implicit scalar anyextend is assumed to be free since the i1
945 // must be stored in a GPR.
946 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
947 CostKind) +
948 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
950 }
951
952 assert(LT.second.isFixedLengthVector());
953 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
954 if (isM1OrSmaller(ContainerVT)) {
955 InstructionCost BV =
956 cast<FixedVectorType>(Ty)->getNumElements() *
957 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
958 if (BV < Cost)
959 Cost = BV;
960 }
961 }
962 return Cost;
963}
964
966RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
967 unsigned AddressSpace,
968 TTI::TargetCostKind CostKind) const {
969 if (!isLegalMaskedLoadStore(Src, Alignment) ||
970 CostKind != TTI::TCK_RecipThroughput)
971 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
972 CostKind);
973
974 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
975}
976
978 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
979 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
980 bool UseMaskForCond, bool UseMaskForGaps) const {
981
982 // The interleaved memory access pass will lower (de)interleave ops combined
983 // with an adjacent appropriate memory operation into vlseg/vsseg
984 // intrinsics. vlseg/vsseg only support masking per-iteration (i.e.
985 // condition), not per-segment (i.e. gap).
986 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
987 auto *VTy = cast<VectorType>(VecTy);
988 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
990 // Need to make sure the type hasn't been scalarized
990 if (LT.second.isVector()) {
991 auto *SubVecTy =
992 VectorType::get(VTy->getElementType(),
993 VTy->getElementCount().divideCoefficientBy(Factor));
994 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
995 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
996 AddressSpace, DL)) {
997
998 // Some processors optimize segment loads/stores as one wide memory op +
999 // Factor * LMUL shuffle ops.
1000 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1001 InstructionCost Cost =
1002 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1003 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1004 Cost += Factor * TLI->getLMULCost(SubVecVT);
1005 return LT.first * Cost;
1006 }
1007
1008 // Otherwise, the cost is proportional to the number of elements (VL *
1009 // Factor ops).
1010 InstructionCost MemOpCost =
1011 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1012 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1013 unsigned NumLoads = getEstimatedVLFor(VTy);
1014 return NumLoads * MemOpCost;
1015 }
1016 }
1017 }
1018
1019 // TODO: Return the cost of interleaved accesses for scalable vector when
1020 // unable to convert to segment accesses instructions.
1021 if (isa<ScalableVectorType>(VecTy))
1023
1024 auto *FVTy = cast<FixedVectorType>(VecTy);
1025 InstructionCost MemCost =
1026 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1027 unsigned VF = FVTy->getNumElements() / Factor;
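// Each member of the interleave group contributes VF lanes of the wide vector.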
1028
1029 // An interleaved load will look like this for Factor=3:
1030 // %wide.vec = load <12 x i32>, ptr %3, align 4
1031 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1032 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1033 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1034 if (Opcode == Instruction::Load) {
1035 InstructionCost Cost = MemCost;
1036 for (unsigned Index : Indices) {
1037 FixedVectorType *VecTy =
1038 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1039 auto Mask = createStrideMask(Index, Factor, VF);
1040 Mask.resize(VF * Factor, -1);
1041 InstructionCost ShuffleCost =
1043 Mask, CostKind, 0, nullptr, {});
1044 Cost += ShuffleCost;
1045 }
1046 return Cost;
1047 }
1048
1049 // TODO: Model for NF > 2
1050 // We'll need to enhance getShuffleCost to model shuffles that are just
1051 // inserts and extracts into subvectors, since they won't have the full cost
1052 // of a vrgather.
1053 // An interleaved store for 3 vectors of 4 lanes will look like
1054 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1055 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1056 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1057 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1058 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1059 if (Factor != 2)
1060 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1061 Alignment, AddressSpace, CostKind,
1062 UseMaskForCond, UseMaskForGaps);
1063
1064 assert(Opcode == Instruction::Store && "Opcode must be a store");
1065 // For an interleaving store of 2 vectors, we perform one large interleaving
1066 // shuffle that goes into the wide store
1067 auto Mask = createInterleaveMask(VF, Factor);
1068 InstructionCost ShuffleCost =
1070 CostKind, 0, nullptr, {});
1071 return MemCost + ShuffleCost;
1072}
1073
1075 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1076 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1077 if (CostKind != TTI::TCK_RecipThroughput)
1078 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1079 Alignment, CostKind, I);
1080
1081 if ((Opcode == Instruction::Load &&
1082 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1083 (Opcode == Instruction::Store &&
1084 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1085 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1086 Alignment, CostKind, I);
1087
1088 // Cost is proportional to the number of memory operations implied. For
1089 // scalable vectors, we use an estimate on that number since we don't
1090 // know exactly what VL will be.
1091 auto &VTy = *cast<VectorType>(DataTy);
1092 InstructionCost MemOpCost =
1093 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1094 {TTI::OK_AnyValue, TTI::OP_None}, I);
1095 unsigned NumLoads = getEstimatedVLFor(&VTy);
1096 return NumLoads * MemOpCost;
1097}
1098
1100 unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
1101 TTI::TargetCostKind CostKind, const Instruction *I) const {
1102 bool IsLegal = (Opcode == Instruction::Store &&
1103 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1104 (Opcode == Instruction::Load &&
1105 isLegalMaskedExpandLoad(DataTy, Alignment));
1106 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1107 return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
1108 Alignment, CostKind, I);
1109 // Example compressstore sequence:
1110 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1111 // vcompress.vm v10, v8, v0
1112 // vcpop.m a1, v0
1113 // vsetvli zero, a1, e32, m2, ta, ma
1114 // vse32.v v10, (a0)
1115 // Example expandload sequence:
1116 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1117 // vcpop.m a1, v0
1118 // vsetvli zero, a1, e32, m2, ta, ma
1119 // vle32.v v10, (a0)
1120 // vsetivli zero, 8, e32, m2, ta, ma
1121 // viota.m v12, v0
1122 // vrgather.vv v8, v10, v12, v0.t
1123 auto MemOpCost =
1124 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1125 auto LT = getTypeLegalizationCost(DataTy);
1126 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1127 if (VariableMask)
1128 Opcodes.push_back(RISCV::VCPOP_M);
1129 if (Opcode == Instruction::Store)
1130 Opcodes.append({RISCV::VCOMPRESS_VM});
1131 else
1132 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1133 return MemOpCost +
1134 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1135}
1136
1138 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1139 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1140 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1141 !isLegalStridedLoadStore(DataTy, Alignment)) ||
1142 (Opcode != Instruction::Load && Opcode != Instruction::Store))
1143 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
1144 Alignment, CostKind, I);
1145
1147 return TTI::TCC_Basic;
1148
1149 // Cost is proportional to the number of memory operations implied. For
1150 // scalable vectors, we use an estimate on that number since we don't
1151 // know exactly what VL will be.
1152 auto &VTy = *cast<VectorType>(DataTy);
1153 InstructionCost MemOpCost =
1154 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1155 {TTI::OK_AnyValue, TTI::OP_None}, I);
1156 unsigned NumLoads = getEstimatedVLFor(&VTy);
1157 return NumLoads * MemOpCost;
1158}
1159
1162 // FIXME: This is a property of the default vector convention, not
1163 // all possible calling conventions. Fixing that will require
1164 // some TTI API and SLP rework.
1165
1166 InstructionCost Cost = 0;
1167 for (auto *Ty : Tys) {
1168 if (!Ty->isVectorTy())
1169 continue;
1170 Align A = DL.getPrefTypeAlign(Ty);
1171 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1172 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1173 }
1174 return Cost;
1175}
1176
1177// Currently, these represent both throughput and codesize costs
1178// for the respective intrinsics. The costs in this table are simply
1179// instruction counts with the following adjustments made:
1180// * One vsetvli is considered free.
1181static const CostTblEntry VectorIntrinsicCostTable[]{
1182 {Intrinsic::floor, MVT::f32, 9},
1183 {Intrinsic::floor, MVT::f64, 9},
1184 {Intrinsic::ceil, MVT::f32, 9},
1185 {Intrinsic::ceil, MVT::f64, 9},
1186 {Intrinsic::trunc, MVT::f32, 7},
1187 {Intrinsic::trunc, MVT::f64, 7},
1188 {Intrinsic::round, MVT::f32, 9},
1189 {Intrinsic::round, MVT::f64, 9},
1190 {Intrinsic::roundeven, MVT::f32, 9},
1191 {Intrinsic::roundeven, MVT::f64, 9},
1192 {Intrinsic::rint, MVT::f32, 7},
1193 {Intrinsic::rint, MVT::f64, 7},
1194 {Intrinsic::nearbyint, MVT::f32, 9},
1195 {Intrinsic::nearbyint, MVT::f64, 9},
1196 {Intrinsic::bswap, MVT::i16, 3},
1197 {Intrinsic::bswap, MVT::i32, 12},
1198 {Intrinsic::bswap, MVT::i64, 31},
1199 {Intrinsic::vp_bswap, MVT::i16, 3},
1200 {Intrinsic::vp_bswap, MVT::i32, 12},
1201 {Intrinsic::vp_bswap, MVT::i64, 31},
1202 {Intrinsic::vp_fshl, MVT::i8, 7},
1203 {Intrinsic::vp_fshl, MVT::i16, 7},
1204 {Intrinsic::vp_fshl, MVT::i32, 7},
1205 {Intrinsic::vp_fshl, MVT::i64, 7},
1206 {Intrinsic::vp_fshr, MVT::i8, 7},
1207 {Intrinsic::vp_fshr, MVT::i16, 7},
1208 {Intrinsic::vp_fshr, MVT::i32, 7},
1209 {Intrinsic::vp_fshr, MVT::i64, 7},
1210 {Intrinsic::bitreverse, MVT::i8, 17},
1211 {Intrinsic::bitreverse, MVT::i16, 24},
1212 {Intrinsic::bitreverse, MVT::i32, 33},
1213 {Intrinsic::bitreverse, MVT::i64, 52},
1214 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1215 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1216 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1217 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1218 {Intrinsic::ctpop, MVT::i8, 12},
1219 {Intrinsic::ctpop, MVT::i16, 19},
1220 {Intrinsic::ctpop, MVT::i32, 20},
1221 {Intrinsic::ctpop, MVT::i64, 21},
1222 {Intrinsic::ctlz, MVT::i8, 19},
1223 {Intrinsic::ctlz, MVT::i16, 28},
1224 {Intrinsic::ctlz, MVT::i32, 31},
1225 {Intrinsic::ctlz, MVT::i64, 35},
1226 {Intrinsic::cttz, MVT::i8, 16},
1227 {Intrinsic::cttz, MVT::i16, 23},
1228 {Intrinsic::cttz, MVT::i32, 24},
1229 {Intrinsic::cttz, MVT::i64, 25},
1230 {Intrinsic::vp_ctpop, MVT::i8, 12},
1231 {Intrinsic::vp_ctpop, MVT::i16, 19},
1232 {Intrinsic::vp_ctpop, MVT::i32, 20},
1233 {Intrinsic::vp_ctpop, MVT::i64, 21},
1234 {Intrinsic::vp_ctlz, MVT::i8, 19},
1235 {Intrinsic::vp_ctlz, MVT::i16, 28},
1236 {Intrinsic::vp_ctlz, MVT::i32, 31},
1237 {Intrinsic::vp_ctlz, MVT::i64, 35},
1238 {Intrinsic::vp_cttz, MVT::i8, 16},
1239 {Intrinsic::vp_cttz, MVT::i16, 23},
1240 {Intrinsic::vp_cttz, MVT::i32, 24},
1241 {Intrinsic::vp_cttz, MVT::i64, 25},
1242};
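// Entries are matched on intrinsic ID and vector element type; the cost found
// here is scaled by the legalization split factor (LT.first) at the lookup
// site below.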
1243
1244InstructionCost
1245RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1246 TTI::TargetCostKind CostKind) const {
1247 auto *RetTy = ICA.getReturnType();
1248 switch (ICA.getID()) {
1249 case Intrinsic::lrint:
1250 case Intrinsic::llrint:
1251 case Intrinsic::lround:
1252 case Intrinsic::llround: {
1253 auto LT = getTypeLegalizationCost(RetTy);
1254 Type *SrcTy = ICA.getArgTypes().front();
1255 auto SrcLT = getTypeLegalizationCost(SrcTy);
1256 if (ST->hasVInstructions() && LT.second.isVector()) {
1257 SmallVector<unsigned, 2> Ops;
1258 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1259 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1260 if (LT.second.getVectorElementType() == MVT::bf16) {
1261 if (!ST->hasVInstructionsBF16Minimal())
1262 return InstructionCost::getInvalid();
1263 if (DstEltSz == 32)
1264 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1265 else
1266 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1267 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1268 !ST->hasVInstructionsF16()) {
1269 if (!ST->hasVInstructionsF16Minimal())
1270 return InstructionCost::getInvalid();
1271 if (DstEltSz == 32)
1272 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1273 else
1274 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1275
1276 } else if (SrcEltSz > DstEltSz) {
1277 Ops = {RISCV::VFNCVT_X_F_W};
1278 } else if (SrcEltSz < DstEltSz) {
1279 Ops = {RISCV::VFWCVT_X_F_V};
1280 } else {
1281 Ops = {RISCV::VFCVT_X_F_V};
1282 }
1283
1284 // We need to use the source LMUL in the case of a narrowing op, and the
1285 // destination LMUL otherwise.
1286 if (SrcEltSz > DstEltSz)
1287 return SrcLT.first *
1288 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1289 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1290 }
1291 break;
1292 }
1293 case Intrinsic::ceil:
1294 case Intrinsic::floor:
1295 case Intrinsic::trunc:
1296 case Intrinsic::rint:
1297 case Intrinsic::round:
1298 case Intrinsic::roundeven: {
1299 // These all use the same code.
1300 auto LT = getTypeLegalizationCost(RetTy);
1301 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1302 return LT.first * 8;
1303 break;
1304 }
1305 case Intrinsic::umin:
1306 case Intrinsic::umax:
1307 case Intrinsic::smin:
1308 case Intrinsic::smax: {
1309 auto LT = getTypeLegalizationCost(RetTy);
1310 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1311 return LT.first;
1312
1313 if (ST->hasVInstructions() && LT.second.isVector()) {
1314 unsigned Op;
1315 switch (ICA.getID()) {
1316 case Intrinsic::umin:
1317 Op = RISCV::VMINU_VV;
1318 break;
1319 case Intrinsic::umax:
1320 Op = RISCV::VMAXU_VV;
1321 break;
1322 case Intrinsic::smin:
1323 Op = RISCV::VMIN_VV;
1324 break;
1325 case Intrinsic::smax:
1326 Op = RISCV::VMAX_VV;
1327 break;
1328 }
1329 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1330 }
1331 break;
1332 }
1333 case Intrinsic::sadd_sat:
1334 case Intrinsic::ssub_sat:
1335 case Intrinsic::uadd_sat:
1336 case Intrinsic::usub_sat: {
1337 auto LT = getTypeLegalizationCost(RetTy);
1338 if (ST->hasVInstructions() && LT.second.isVector()) {
1339 unsigned Op;
1340 switch (ICA.getID()) {
1341 case Intrinsic::sadd_sat:
1342 Op = RISCV::VSADD_VV;
1343 break;
1344 case Intrinsic::ssub_sat:
1345 Op = RISCV::VSSUB_VV;
1346 break;
1347 case Intrinsic::uadd_sat:
1348 Op = RISCV::VSADDU_VV;
1349 break;
1350 case Intrinsic::usub_sat:
1351 Op = RISCV::VSSUBU_VV;
1352 break;
1353 }
1354 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1355 }
1356 break;
1357 }
1358 case Intrinsic::fma:
1359 case Intrinsic::fmuladd: {
1360 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1361 auto LT = getTypeLegalizationCost(RetTy);
1362 if (ST->hasVInstructions() && LT.second.isVector())
1363 return LT.first *
1364 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1365 break;
1366 }
1367 case Intrinsic::fabs: {
1368 auto LT = getTypeLegalizationCost(RetTy);
1369 if (ST->hasVInstructions() && LT.second.isVector()) {
1370 // lui a0, 8
1371 // addi a0, a0, -1
1372 // vsetvli a1, zero, e16, m1, ta, ma
1373 // vand.vx v8, v8, a0
1374 // f16 with zvfhmin and bf16 with zvfbfmin
1375 if (LT.second.getVectorElementType() == MVT::bf16 ||
1376 (LT.second.getVectorElementType() == MVT::f16 &&
1377 !ST->hasVInstructionsF16()))
1378 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1379 CostKind) +
1380 2;
1381 else
1382 return LT.first *
1383 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1384 }
1385 break;
1386 }
1387 case Intrinsic::sqrt: {
1388 auto LT = getTypeLegalizationCost(RetTy);
1389 if (ST->hasVInstructions() && LT.second.isVector()) {
1390 SmallVector<unsigned, 4> ConvOp;
1391 SmallVector<unsigned, 2> FsqrtOp;
1392 MVT ConvType = LT.second;
1393 MVT FsqrtType = LT.second;
1394 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1395 // will be split.
1396 if (LT.second.getVectorElementType() == MVT::bf16) {
1397 if (LT.second == MVT::nxv32bf16) {
1398 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1399 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1400 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1401 ConvType = MVT::nxv16f16;
1402 FsqrtType = MVT::nxv16f32;
1403 } else {
1404 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1405 FsqrtOp = {RISCV::VFSQRT_V};
1406 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1407 }
1408 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1409 !ST->hasVInstructionsF16()) {
1410 if (LT.second == MVT::nxv32f16) {
1411 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1412 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1413 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1414 ConvType = MVT::nxv16f16;
1415 FsqrtType = MVT::nxv16f32;
1416 } else {
1417 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1418 FsqrtOp = {RISCV::VFSQRT_V};
1419 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1420 }
1421 } else {
1422 FsqrtOp = {RISCV::VFSQRT_V};
1423 }
1424
1425 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1426 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1427 }
1428 break;
1429 }
1430 case Intrinsic::cttz:
1431 case Intrinsic::ctlz:
1432 case Intrinsic::ctpop: {
1433 auto LT = getTypeLegalizationCost(RetTy);
1434 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1435 unsigned Op;
1436 switch (ICA.getID()) {
1437 case Intrinsic::cttz:
1438 Op = RISCV::VCTZ_V;
1439 break;
1440 case Intrinsic::ctlz:
1441 Op = RISCV::VCLZ_V;
1442 break;
1443 case Intrinsic::ctpop:
1444 Op = RISCV::VCPOP_V;
1445 break;
1446 }
1447 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1448 }
1449 break;
1450 }
1451 case Intrinsic::abs: {
1452 auto LT = getTypeLegalizationCost(RetTy);
1453 if (ST->hasVInstructions() && LT.second.isVector()) {
1454 // vrsub.vi v10, v8, 0
1455 // vmax.vv v8, v8, v10
1456 return LT.first *
1457 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1458 LT.second, CostKind);
1459 }
1460 break;
1461 }
1462 case Intrinsic::get_active_lane_mask: {
1463 if (ST->hasVInstructions()) {
1464 Type *ExpRetTy = VectorType::get(
1465 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1466 auto LT = getTypeLegalizationCost(ExpRetTy);
1467
1468 // vid.v v8 // considered hoisted
1469 // vsaddu.vx v8, v8, a0
1470 // vmsltu.vx v0, v8, a1
1471 return LT.first *
1472 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1473 LT.second, CostKind);
1474 }
1475 break;
1476 }
1477 // TODO: add more intrinsics
1478 case Intrinsic::stepvector: {
1479 auto LT = getTypeLegalizationCost(RetTy);
1480 // Legalisation of illegal types involves an `index' instruction plus
1481 // (LT.first - 1) vector adds.
1482 if (ST->hasVInstructions())
1483 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1484 (LT.first - 1) *
1485 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1486 return 1 + (LT.first - 1);
1487 }
1488 case Intrinsic::experimental_cttz_elts: {
1489 Type *ArgTy = ICA.getArgTypes()[0];
1490 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1491 if (getTLI()->shouldExpandCttzElements(ArgType))
1492 break;
1493 InstructionCost Cost = getRISCVInstructionCost(
1494 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1495
1496 // If zero_is_poison is false, then we will generate additional
1497 // cmp + select instructions to convert -1 to EVL.
1498 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1499 if (ICA.getArgs().size() > 1 &&
1500 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1501 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1503 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1505
1506 return Cost;
1507 }
1508 case Intrinsic::experimental_vp_splat: {
1509 auto LT = getTypeLegalizationCost(RetTy);
1510 // TODO: Lower i1 experimental_vp_splat
1511 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1512 return InstructionCost::getInvalid();
1513 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1514 ? RISCV::VFMV_V_F
1515 : RISCV::VMV_V_X,
1516 LT.second, CostKind);
1517 }
1518 case Intrinsic::experimental_vp_splice: {
1519 // To support type-based queries from the vectorizer, set the index to 0.
1520 // Note that the index only changes the cost from vslide.vx to vslide.vi,
1521 // and in the current implementation they have the same cost.
1522 return getShuffleCost(TTI::SK_Splice, cast<VectorType>(ICA.getReturnType()),
1523 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1524 0, cast<VectorType>(ICA.getReturnType()));
1525 }
1526 case Intrinsic::fptoui_sat:
1527 case Intrinsic::fptosi_sat: {
1528 InstructionCost Cost = 0;
1529 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1530 Type *SrcTy = ICA.getArgTypes()[0];
1531
1532 auto SrcLT = getTypeLegalizationCost(SrcTy);
1533 auto DstLT = getTypeLegalizationCost(RetTy);
1534 if (!SrcTy->isVectorTy())
1535 break;
1536
1537 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1538 return InstructionCost::getInvalid();
1539
1540 Cost +=
1541 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1543
1544 // Handle NaN.
1545 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1546 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1547 Type *CondTy = RetTy->getWithNewBitWidth(1);
1548 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1550 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1552 return Cost;
1553 }
1554 }
1555
1556 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1557 if (auto LT = getTypeLegalizationCost(RetTy);
1558 LT.second.isVector()) {
1559 MVT EltTy = LT.second.getVectorElementType();
1560 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1561 ICA.getID(), EltTy))
1562 return LT.first * Entry->Cost;
1563 }
1564 }
1565
1566 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1567}
1568
1569InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1570 Type *Src,
1571 TTI::CastContextHint CCH,
1572 TTI::TargetCostKind CostKind,
1573 const Instruction *I) const {
1574 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1575 if (!IsVectorType)
1576 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1577
1578 // FIXME: Need to compute legalizing cost for illegal types. The current
1579 // code handles only legal types and those which can be trivially
1580 // promoted to legal.
1581 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1582 Dst->getScalarSizeInBits() > ST->getELen())
1583 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1584
1585 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1586 assert(ISD && "Invalid opcode");
1587 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1588 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1589
1590 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1591 // The shared implementation doesn't model vector widening during legalization
1592 // and instead assumes scalarization. In order to scalarize an <N x i1>
1593 // vector, we need to extend/trunc to/from i8. If we don't special case
1594 // this, we can get an infinite recursion cycle.
1595 switch (ISD) {
1596 default:
1597 break;
1598 case ISD::SIGN_EXTEND:
1599 case ISD::ZERO_EXTEND:
1600 if (Src->getScalarSizeInBits() == 1) {
1602 // We do not use vsext/vzext to extend from a mask vector.
1603 // Instead we use the following instructions to extend from a mask vector:
1603 // vmv.v.i v8, 0
1604 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1605 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1606 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1607 DstLT.second, CostKind) +
1608 DstLT.first - 1;
1609 }
1610 break;
1611 case ISD::TRUNCATE:
1612 if (Dst->getScalarSizeInBits() == 1) {
1613 // We do not use several vncvt instructions to truncate to a mask vector,
1614 // so we cannot use PowDiff to calculate the cost.
1615 // Instead we use the following instructions to truncate to a mask vector:
1616 // vand.vi v8, v8, 1
1617 // vmsne.vi v0, v8, 0
1618 return SrcLT.first *
1619 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1620 SrcLT.second, CostKind) +
1621 SrcLT.first - 1;
1622 }
1623 break;
1624 };
1625
1626 // Our actual lowering for the case where a wider legal type is available
1627 // uses promotion to the wider type. This is reflected in the result of
1628 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1629 // scalarized if the legalized Src and Dst are not equal sized.
1630 const DataLayout &DL = this->getDataLayout();
1631 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1632 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1634 SrcLT.second.getSizeInBits()) ||
1636 DstLT.second.getSizeInBits()))
1637 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1638
1639 // The split cost is handled by the base getCastInstrCost
1640 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1641
1642 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1643 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1644 switch (ISD) {
1645 case ISD::SIGN_EXTEND:
1646 case ISD::ZERO_EXTEND: {
1647 if ((PowDiff < 1) || (PowDiff > 3))
1648 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1649 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1650 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1651 unsigned Op =
1652 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1653 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1654 }
1655 case ISD::TRUNCATE:
1656 case ISD::FP_EXTEND:
1657 case ISD::FP_ROUND: {
1658 // Counts of narrow/widen instructions.
1659 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1660 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1661
1662 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1663 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1664 : RISCV::VFNCVT_F_F_W;
1665 InstructionCost Cost = 0;
1666 for (; SrcEltSize != DstEltSize;) {
1667 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1668 ? MVT::getIntegerVT(DstEltSize)
1669 : MVT::getFloatingPointVT(DstEltSize);
1670 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1671 DstEltSize =
1672 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1673 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1674 }
1675 return Cost;
1676 }
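  // Illustrative example of the loop above (a sketch): truncating a legal i64
  // vector to an i8 vector narrows in halves (64 -> 32 -> 16 -> 8), so it is
  // costed as three vnsrl.wi steps, each at the corresponding intermediate
  // destination type; fpext/fptrunc chains are counted the same way with
  // vfwcvt.f.f.v / vfncvt.f.f.w.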
1677 case ISD::FP_TO_SINT:
1678 case ISD::FP_TO_UINT: {
1679 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1680 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1681 unsigned FWCVT =
1682 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1683 unsigned FNCVT =
1684 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1685 unsigned SrcEltSize = Src->getScalarSizeInBits();
1686 unsigned DstEltSize = Dst->getScalarSizeInBits();
1687 InstructionCost Cost = 0;
1688 if ((SrcEltSize == 16) &&
1689 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1690 // If the target only supports zvfhmin, or this is an fp16-to-i64
1691 // conversion, pre-widen to f32 and then convert f32 to the integer type.
1692 VectorType *VecF32Ty =
1693 VectorType::get(Type::getFloatTy(Dst->getContext()),
1694 cast<VectorType>(Dst)->getElementCount());
1695 std::pair<InstructionCost, MVT> VecF32LT =
1696 getTypeLegalizationCost(VecF32Ty);
1697 Cost +=
1698 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1699 VecF32LT.second, CostKind);
1700 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1701 return Cost;
1702 }
1703 if (DstEltSize == SrcEltSize)
1704 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1705 else if (DstEltSize > SrcEltSize)
1706 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1707 else { // (SrcEltSize > DstEltSize)
1708 // First do a narrowing conversion to an integer half the size, then
1709 // truncate if needed.
1710 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1711 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1712 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1713 if ((SrcEltSize / 2) > DstEltSize) {
1714 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1715 Cost +=
1716 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1717 }
1718 }
1719 return Cost;
1720 }
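  // Illustrative example (a sketch): fptosi <4 x half> to <4 x i64> takes the
  // pre-widening path above because 64/2 > 16, so it is costed as one
  // vfwcvt.f.f.v (f16 -> f32) plus the recursive f32 -> i64 conversion, which
  // itself maps to the widening vfwcvt.rtz.x.f.v case.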
1721 case ISD::SINT_TO_FP:
1722 case ISD::UINT_TO_FP: {
1723 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1724 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1725 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1726 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1727 unsigned SrcEltSize = Src->getScalarSizeInBits();
1728 unsigned DstEltSize = Dst->getScalarSizeInBits();
1729
1730 InstructionCost Cost = 0;
1731 if ((DstEltSize == 16) &&
1732 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1733 // If the target only supports zvfhmin, or this is an i64-to-fp16
1734 // conversion, convert to f32 first and then narrow f32 to f16.
1735 VectorType *VecF32Ty =
1736 VectorType::get(Type::getFloatTy(Dst->getContext()),
1737 cast<VectorType>(Dst)->getElementCount());
1738 std::pair<InstructionCost, MVT> VecF32LT =
1739 getTypeLegalizationCost(VecF32Ty);
1740 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1741 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1742 DstLT.second, CostKind);
1743 return Cost;
1744 }
1745
1746 if (DstEltSize == SrcEltSize)
1747 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1748 else if (DstEltSize > SrcEltSize) {
1749 if ((DstEltSize / 2) > SrcEltSize) {
1750 VectorType *VecTy =
1751 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1752 cast<VectorType>(Dst)->getElementCount());
1753 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1754 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1755 }
1756 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1757 } else
1758 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1759 return Cost;
1760 }
1761 }
1762 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1763}
1764
1765unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1766 if (isa<ScalableVectorType>(Ty)) {
1767 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1768 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1769 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1770 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1771 }
1772 return cast<FixedVectorType>(Ty)->getNumElements();
1773}
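// Illustrative example (a sketch, assuming getVScaleForTuning() returns 2):
// for <vscale x 4 x i32> this estimates 2 * 4 = 8 lanes, while a fixed
// <8 x i32> simply reports its 8 elements.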
1774
1775InstructionCost
1776RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1777 FastMathFlags FMF,
1778 TTI::TargetCostKind CostKind) const {
1779 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1780 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1781
1782 // Skip if scalar size of Ty is bigger than ELEN.
1783 if (Ty->getScalarSizeInBits() > ST->getELen())
1784 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1785
1786 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1787 if (Ty->getElementType()->isIntegerTy(1)) {
1788 // SelectionDAGBuilder does following transforms:
1789 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1790 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1791 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1792 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1793 else
1794 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1795 }
1796
1797 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1798 SmallVector<unsigned, 3> Opcodes;
1799 InstructionCost ExtraCost = 0;
1800 switch (IID) {
1801 case Intrinsic::maximum:
1802 if (FMF.noNaNs()) {
1803 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1804 } else {
1805 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1806 RISCV::VFMV_F_S};
1807 // Cost of canonical NaN + branch
1808 // lui a0, 523264
1809 // fmv.w.x fa0, a0
1810 Type *DstTy = Ty->getScalarType();
1811 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1812 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1813 ExtraCost = 1 +
1814 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1815 TTI::CastContextHint::None, CostKind) +
1816 getCFInstrCost(Instruction::Br, CostKind);
1817 }
1818 break;
1819
1820 case Intrinsic::minimum:
1821 if (FMF.noNaNs()) {
1822 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1823 } else {
1824 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1825 RISCV::VFMV_F_S};
1826 // Cost of canonical NaN + branch
1827 // lui a0, 523264
1828 // fmv.w.x fa0, a0
1829 Type *DstTy = Ty->getScalarType();
1830 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1831 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1832 ExtraCost = 1 +
1833 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1834 TTI::CastContextHint::None, CostKind) +
1835 getCFInstrCost(Instruction::Br, CostKind);
1836 }
1837 break;
1838 }
1839 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1840 }
1841
1842 // An IR reduction is composed of one RVV reduction instruction and a vmv.
1843 unsigned SplitOp;
1844 SmallVector<unsigned, 3> Opcodes;
1845 switch (IID) {
1846 default:
1847 llvm_unreachable("Unsupported intrinsic");
1848 case Intrinsic::smax:
1849 SplitOp = RISCV::VMAX_VV;
1850 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1851 break;
1852 case Intrinsic::smin:
1853 SplitOp = RISCV::VMIN_VV;
1854 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1855 break;
1856 case Intrinsic::umax:
1857 SplitOp = RISCV::VMAXU_VV;
1858 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1859 break;
1860 case Intrinsic::umin:
1861 SplitOp = RISCV::VMINU_VV;
1862 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1863 break;
1864 case Intrinsic::maxnum:
1865 SplitOp = RISCV::VFMAX_VV;
1866 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1867 break;
1868 case Intrinsic::minnum:
1869 SplitOp = RISCV::VFMIN_VV;
1870 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1871 break;
1872 }
1873 // Add a cost for data larger than LMUL8
1874 InstructionCost SplitCost =
1875 (LT.first > 1) ? (LT.first - 1) *
1876 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1877 : 0;
1878 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1879}
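// Illustrative example of the split handling above (a sketch): a umax
// reduction of a type that legalizes to two register groups (LT.first == 2)
// is costed as one vmaxu.vv to combine the halves plus vredmaxu.vs + vmv.x.s
// on the legalized type.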
1880
1881InstructionCost
1882RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1883 std::optional<FastMathFlags> FMF,
1884 TTI::TargetCostKind CostKind) const {
1885 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1886 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1887
1888 // Skip if scalar size of Ty is bigger than ELEN.
1889 if (Ty->getScalarSizeInBits() > ST->getELen())
1890 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1891
1892 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1893 assert(ISD && "Invalid opcode");
1894
1895 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1896 ISD != ISD::FADD)
1897 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1898
1899 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1900 Type *ElementTy = Ty->getElementType();
1901 if (ElementTy->isIntegerTy(1)) {
1902 // Example sequences:
1903 // vfirst.m a0, v0
1904 // seqz a0, a0
1905 if (LT.second == MVT::v1i1)
1906 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
1907 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1908 CmpInst::ICMP_EQ, CostKind);
1909
1910 if (ISD == ISD::AND) {
1911 // Example sequences:
1912 // vmand.mm v8, v9, v8 ; needed every time type is split
1913 // vmnot.m v8, v0 ; alias for vmnand
1914 // vcpop.m a0, v8
1915 // seqz a0, a0
1916
1917 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
1918 // For LMUL <= 8, there is no splitting,
1919 // the sequences are vmnot, vcpop and seqz.
1920 // When LMUL > 8 and split = 1,
1921 // the sequences are vmnand, vcpop and seqz.
1922 // When LMUL > 8 and split > 1,
1923 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
1924 return ((LT.first > 2) ? (LT.first - 2) : 0) *
1925 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
1926 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
1927 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1928 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1929 CmpInst::ICMP_EQ, CostKind);
1930 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
1931 // Example sequences:
1932 // vsetvli a0, zero, e8, mf8, ta, ma
1933 // vmxor.mm v8, v0, v8 ; needed every time type is split
1934 // vcpop.m a0, v8
1935 // andi a0, a0, 1
1936 return (LT.first - 1) *
1937 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1938 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
1939 } else {
1940 assert(ISD == ISD::OR);
1941 // Example sequences:
1942 // vsetvli a0, zero, e8, mf8, ta, ma
1943 // vmor.mm v8, v9, v8 ; needed every time type is split
1944 // vcpop.m a0, v0
1945 // snez a0, a0
1946 return (LT.first - 1) *
1947 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
1948 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1949 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1950 CmpInst::ICMP_NE, CostKind);
1951 }
1952 }
1953
1954 // An IR or/and reduction is composed of one vmv and one RVV reduction
1955 // instruction; the other reductions are composed of two vmv and one RVV
1956 // reduction instruction.
1957 unsigned SplitOp;
1958 SmallVector<unsigned, 3> Opcodes;
1959 switch (ISD) {
1960 case ISD::ADD:
1961 SplitOp = RISCV::VADD_VV;
1962 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1963 break;
1964 case ISD::OR:
1965 SplitOp = RISCV::VOR_VV;
1966 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
1967 break;
1968 case ISD::XOR:
1969 SplitOp = RISCV::VXOR_VV;
1970 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1971 break;
1972 case ISD::AND:
1973 SplitOp = RISCV::VAND_VV;
1974 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
1975 break;
1976 case ISD::FADD:
1977 // We can't promote f16/bf16 fadd reductions.
1978 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
1979 LT.second.getScalarType() == MVT::bf16)
1980 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1981 if (TTI::requiresOrderedReduction(FMF)) {
1982 Opcodes.push_back(RISCV::VFMV_S_F);
1983 for (unsigned i = 0; i < LT.first.getValue(); i++)
1984 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1985 Opcodes.push_back(RISCV::VFMV_F_S);
1986 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1987 }
1988 SplitOp = RISCV::VFADD_VV;
1989 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1990 break;
1991 }
1992 // Add a cost for data larger than LMUL8
1993 InstructionCost SplitCost =
1994 (LT.first > 1) ? (LT.first - 1) *
1995 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1996 : 0;
1997 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1998}
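// Illustrative example of the i1 handling above (a sketch): an i1 add
// reduction over a type split into two mask registers (LT.first == 2) is
// costed as (2 - 1) * cost(vmxor.mm) + cost(vcpop.m) + 1, where the trailing
// 1 accounts for the final andi.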
1999
2000InstructionCost RISCVTTIImpl::getExtendedReductionCost(
2001 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2002 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2003 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2004 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2005 FMF, CostKind);
2006
2007 // Skip if scalar size of ResTy is bigger than ELEN.
2008 if (ResTy->getScalarSizeInBits() > ST->getELen())
2009 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2010 FMF, CostKind);
2011
2012 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2013 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2014 FMF, CostKind);
2015
2016 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2017
2018 if (IsUnsigned && Opcode == Instruction::Add &&
2019 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2020 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2021 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
2022 return LT.first *
2023 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2024 }
2025
2026 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2027 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2028 FMF, CostKind);
2029
2030 return (LT.first - 1) +
2031 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2032}
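// Illustrative examples (a sketch): vector_reduce_add(zext <8 x i1> to
// <8 x i32>) is costed as a single vcpop.m on the legalized mask type, while
// vector_reduce_add(zext <8 x i16> to <8 x i32>) is costed as the i16 add
// reduction plus (LT.first - 1) for any splitting.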
2033
2034InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
2035 TTI::OperandValueInfo OpInfo,
2036 TTI::TargetCostKind CostKind) const {
2037 assert(OpInfo.isConstant() && "non constant operand?");
2038 if (!isa<VectorType>(Ty))
2039 // FIXME: We need to account for immediate materialization here, but doing
2040 // a decent job requires more knowledge about the immediate than we
2041 // currently have here.
2042 return 0;
2043
2044 if (OpInfo.isUniform())
2045 // vmv.v.i, vmv.v.x, or vfmv.v.f
2046 // We ignore the cost of the scalar constant materialization to be consistent
2047 // with how we treat scalar constants themselves just above.
2048 return 1;
2049
2050 return getConstantPoolLoadCost(Ty, CostKind);
2051}
2052
2053InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2054 Align Alignment,
2055 unsigned AddressSpace,
2056 TTI::TargetCostKind CostKind,
2057 TTI::OperandValueInfo OpInfo,
2058 const Instruction *I) const {
2059 EVT VT = TLI->getValueType(DL, Src, true);
2060 // Type legalization can't handle structs
2061 if (VT == MVT::Other)
2062 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2063 CostKind, OpInfo, I);
2064
2065 InstructionCost Cost = 0;
2066 if (Opcode == Instruction::Store && OpInfo.isConstant())
2067 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2068
2069 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2070
2071 InstructionCost BaseCost = [&]() {
2072 InstructionCost Cost = LT.first;
2073 if (CostKind != TTI::TCK_RecipThroughput)
2074 return Cost;
2075
2076 // Our actual lowering for the case where a wider legal type is available
2077 // uses a VL predicated load on the wider type. This is reflected in
2078 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2079 // widened cases are scalarized.
2080 const DataLayout &DL = this->getDataLayout();
2081 if (Src->isVectorTy() && LT.second.isVector() &&
2082 TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
2083 LT.second.getSizeInBits()))
2084 return Cost;
2085
2086 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2087 CostKind, OpInfo, I);
2088 }();
2089
2090 // Assume memory ops cost scale with the number of vector registers
2091 // possibly accessed by the instruction. Note that BasicTTI already
2092 // handles the LT.first term for us.
2093 if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
2094 BaseCost *= TLI->getLMULCost(LT.second);
2095 return Cost + BaseCost;
2096}
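// Illustrative example of the scaling above (a sketch): a vector load whose
// legalized type occupies LMUL=4 has its per-register-group cost multiplied
// by the LMUL factor for throughput/latency queries, while code-size queries
// keep the unscaled LT.first-based cost.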
2097
2098InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2099 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2100 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2101 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2102 if (CostKind != TTI::TCK_RecipThroughput)
2103 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2104 Op1Info, Op2Info, I);
2105
2106 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2107 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2108 Op1Info, Op2Info, I);
2109
2110 // Skip if scalar size of ValTy is bigger than ELEN.
2111 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2112 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2113 Op1Info, Op2Info, I);
2114
2115 auto GetConstantMatCost =
2116 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2117 if (OpInfo.isUniform())
2118 // We return 0 because we currently ignore the cost of materializing
2119 // scalar constants in GPRs.
2120 return 0;
2121
2122 return getConstantPoolLoadCost(ValTy, CostKind);
2123 };
2124
2125 InstructionCost ConstantMatCost;
2126 if (Op1Info.isConstant())
2127 ConstantMatCost += GetConstantMatCost(Op1Info);
2128 if (Op2Info.isConstant())
2129 ConstantMatCost += GetConstantMatCost(Op2Info);
2130
2131 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2132 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2133 if (CondTy->isVectorTy()) {
2134 if (ValTy->getScalarSizeInBits() == 1) {
2135 // vmandn.mm v8, v8, v9
2136 // vmand.mm v9, v0, v9
2137 // vmor.mm v0, v9, v8
2138 return ConstantMatCost +
2139 LT.first *
2140 getRISCVInstructionCost(
2141 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2142 LT.second, CostKind);
2143 }
2144 // vselect and max/min are supported natively.
2145 return ConstantMatCost +
2146 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2147 CostKind);
2148 }
2149
2150 if (ValTy->getScalarSizeInBits() == 1) {
2151 // vmv.v.x v9, a0
2152 // vmsne.vi v9, v9, 0
2153 // vmandn.mm v8, v8, v9
2154 // vmand.mm v9, v0, v9
2155 // vmor.mm v0, v9, v8
2156 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2157 return ConstantMatCost +
2158 LT.first *
2159 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2160 InterimVT, CostKind) +
2161 LT.first * getRISCVInstructionCost(
2162 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2163 LT.second, CostKind);
2164 }
2165
2166 // vmv.v.x v10, a0
2167 // vmsne.vi v0, v10, 0
2168 // vmerge.vvm v8, v9, v8, v0
2169 return ConstantMatCost +
2170 LT.first * getRISCVInstructionCost(
2171 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2172 LT.second, CostKind);
2173 }
2174
2175 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2176 CmpInst::isIntPredicate(VecPred)) {
2177 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2178 // provided they incur the same cost across all implementations
2179 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2180 LT.second,
2181 CostKind);
2182 }
2183
2184 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2185 CmpInst::isFPPredicate(VecPred)) {
2186
2187 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2188 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2189 return ConstantMatCost +
2190 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2191
2192 // If we do not support the input floating point vector type, use the base
2193 // one which will calculate as:
2194 // ScalarizeCost + Num * Cost for fixed vector,
2195 // InvalidCost for scalable vector.
2196 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2197 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2198 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2199 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2200 Op1Info, Op2Info, I);
2201
2202 // Assuming vector fp compare and mask instructions are all the same cost
2203 // until a need arises to differentiate them.
2204 switch (VecPred) {
2205 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2206 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2207 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2208 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2209 return ConstantMatCost +
2210 LT.first * getRISCVInstructionCost(
2211 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2212 LT.second, CostKind);
2213
2214 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2215 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2216 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2217 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2218 return ConstantMatCost +
2219 LT.first *
2220 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2221 LT.second, CostKind);
2222
2223 case CmpInst::FCMP_OEQ: // vmfeq.vv
2224 case CmpInst::FCMP_OGT: // vmflt.vv
2225 case CmpInst::FCMP_OGE: // vmfle.vv
2226 case CmpInst::FCMP_OLT: // vmflt.vv
2227 case CmpInst::FCMP_OLE: // vmfle.vv
2228 case CmpInst::FCMP_UNE: // vmfne.vv
2229 return ConstantMatCost +
2230 LT.first *
2231 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2232 default:
2233 break;
2234 }
2235 }
2236
2237 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2238 // instructions will lower to SELECT_CC and then to PseudoCCMOVGPR, which will
2239 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2240 // be (0 + select instr cost).
2241 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2242 ValTy->isIntegerTy() && !I->user_empty()) {
2243 if (all_of(I->users(), [&](const User *U) {
2244 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2245 U->getType()->isIntegerTy() &&
2246 !isa<ConstantData>(U->getOperand(1)) &&
2247 !isa<ConstantData>(U->getOperand(2));
2248 }))
2249 return 0;
2250 }
2251
2252 // TODO: Add cost for scalar type.
2253
2254 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2255 Op1Info, Op2Info, I);
2256}
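// Illustrative example of the select costing above (a sketch): a vector
// select with a vector i1 condition and non-i1 elements maps to one
// vmerge.vvm per legalized register group, whereas a scalar condition first
// pays for vmv.v.x + vmsne.vi to materialize the mask.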
2257
2258InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2259 TTI::TargetCostKind CostKind,
2260 const Instruction *I) const {
2261 if (CostKind != TTI::TCK_RecipThroughput)
2262 return Opcode == Instruction::PHI ? 0 : 1;
2263 // Branches are assumed to be predicted.
2264 return 0;
2265}
2266
2267InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2268 TTI::TargetCostKind CostKind,
2269 unsigned Index,
2270 const Value *Op0,
2271 const Value *Op1) const {
2272 assert(Val->isVectorTy() && "This must be a vector type");
2273
2274 if (Opcode != Instruction::ExtractElement &&
2275 Opcode != Instruction::InsertElement)
2276 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2277
2278 // Legalize the type.
2279 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2280
2281 // This type is legalized to a scalar type.
2282 if (!LT.second.isVector()) {
2283 auto *FixedVecTy = cast<FixedVectorType>(Val);
2284 // If Index is a known constant, cost is zero.
2285 if (Index != -1U)
2286 return 0;
2287 // Extract/InsertElement with non-constant index is very costly when
2288 // scalarized; estimate cost of loads/stores sequence via the stack:
2289 // ExtractElement cost: store vector to stack, load scalar;
2290 // InsertElement cost: store vector to stack, store scalar, load vector.
2291 Type *ElemTy = FixedVecTy->getElementType();
2292 auto NumElems = FixedVecTy->getNumElements();
2293 auto Align = DL.getPrefTypeAlign(ElemTy);
2294 InstructionCost LoadCost =
2295 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2296 InstructionCost StoreCost =
2297 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2298 return Opcode == Instruction::ExtractElement
2299 ? StoreCost * NumElems + LoadCost
2300 : (StoreCost + LoadCost) * NumElems + StoreCost;
2301 }
2302
2303 // For unsupported scalable vector.
2304 if (LT.second.isScalableVector() && !LT.first.isValid())
2305 return LT.first;
2306
2307 // Mask vector extract/insert is expanded via e8.
2308 if (Val->getScalarSizeInBits() == 1) {
2309 VectorType *WideTy =
2310 VectorType::get(IntegerType::get(Val->getContext(), 8),
2311 cast<VectorType>(Val)->getElementCount());
2312 if (Opcode == Instruction::ExtractElement) {
2313 InstructionCost ExtendCost
2314 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2315 TTI::CastContextHint::None, CostKind);
2316 InstructionCost ExtractCost
2317 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2318 return ExtendCost + ExtractCost;
2319 }
2320 InstructionCost ExtendCost
2321 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2322 TTI::CastContextHint::None, CostKind);
2323 InstructionCost InsertCost
2324 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2325 InstructionCost TruncCost
2326 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2327 TTI::CastContextHint::None, CostKind);
2328 return ExtendCost + InsertCost + TruncCost;
2329 }
2330
2331
2332 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2333 // and vslideup + vmv.s.x to insert element to vector.
2334 unsigned BaseCost = 1;
2335 // For insertelement we also need an addi to compute index+1 as the input of vslideup.
2336 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2337
2338 if (Index != -1U) {
2339 // The type may be split. For fixed-width vectors we can normalize the
2340 // index to the new type.
2341 if (LT.second.isFixedLengthVector()) {
2342 unsigned Width = LT.second.getVectorNumElements();
2343 Index = Index % Width;
2344 }
2345
2346 // If exact VLEN is known, we will insert/extract into the appropriate
2347 // subvector with no additional subvector insert/extract cost.
2348 if (auto VLEN = ST->getRealVLen()) {
2349 unsigned EltSize = LT.second.getScalarSizeInBits();
2350 unsigned M1Max = *VLEN / EltSize;
2351 Index = Index % M1Max;
2352 }
2353
2354 if (Index == 0)
2355 // We can extract/insert the first element without vslidedown/vslideup.
2356 SlideCost = 0;
2357 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2358 Val->getScalarType()->isIntegerTy())
2359 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2360 else if (Opcode == Instruction::InsertElement)
2361 SlideCost = 1; // With a constant index, we do not need to use addi.
2362 }
2363
2364 // When the vector needs to be split into multiple register groups and the
2365 // index exceeds a single vector register group, we need to insert/extract
2366 // the element via the stack.
2367 if (LT.first > 1 &&
2368 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2369 LT.second.isScalableVector()))) {
2370 Type *ScalarType = Val->getScalarType();
2371 Align VecAlign = DL.getPrefTypeAlign(Val);
2372 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2373 // Extra addi for unknown index.
2374 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2375
2376 // Store all split vectors into stack and load the target element.
2377 if (Opcode == Instruction::ExtractElement)
2378 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2379 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2380 CostKind) +
2381 IdxCost;
2382
2383 // Store all split vectors into stack and store the target element and load
2384 // vectors back.
2385 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2386 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2387 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2388 CostKind) +
2389 IdxCost;
2390 }
2391
2392 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2393 if (Val->getScalarType()->isIntegerTy() &&
2394 ST->getXLen() < Val->getScalarSizeInBits()) {
2395 // For extractelement, we need the following instructions:
2396 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2397 // vslidedown.vx v8, v8, a0
2398 // vmv.x.s a0, v8
2399 // li a1, 32
2400 // vsrl.vx v8, v8, a1
2401 // vmv.x.s a1, v8
2402
2403 // For insertelement, we need the following instructions:
2404 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2405 // vmv.v.i v12, 0
2406 // vslide1up.vx v16, v12, a1
2407 // vslide1up.vx v12, v16, a0
2408 // addi a0, a2, 1
2409 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2410 // vslideup.vx v8, v12, a2
2411
2412 // TODO: should we count these special vsetvlis?
2413 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2414 }
2415 return BaseCost + SlideCost;
2416}
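// Illustrative examples of the slide costing above (a sketch): extracting
// element 0 costs only the vmv.x.s (SlideCost == 0); extracting an unknown
// lane costs vslidedown.vx + vmv.x.s; inserting at an unknown lane also pays
// for the addi that forms index+1 for vslideup (SlideCost == 2).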
2417
2418InstructionCost
2419RISCVTTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
2420 TTI::TargetCostKind CostKind,
2421 unsigned Index) const {
2422 if (isa<FixedVectorType>(Val))
2423 return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
2424 Index);
2425
2426 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2427 // for the cost of extracting the last lane of a scalable vector. It probably
2428 // needs a more accurate cost.
2429 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2430 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2431 return getVectorInstrCost(Opcode, Val, CostKind,
2432 EC.getKnownMinValue() - 1 - Index, nullptr,
2433 nullptr);
2434}
2435
2436InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2437 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2438 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2439 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2440
2441 // TODO: Handle more cost kinds.
2442 if (CostKind != TTI::TCK_RecipThroughput)
2443 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2444 Args, CxtI);
2445
2446 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2447 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2448 Args, CxtI);
2449
2450 // Skip if scalar size of Ty is bigger than ELEN.
2451 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2452 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2453 Args, CxtI);
2454
2455 // Legalize the type.
2456 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2457
2458 // TODO: Handle scalar type.
2459 if (!LT.second.isVector())
2460 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2461 Args, CxtI);
2462
2463 // f16 with zvfhmin and bf16 will be promoted to f32.
2464 // FIXME: nxv32[b]f16 will be custom lowered and split.
2465 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2466 InstructionCost CastCost = 0;
2467 if ((LT.second.getVectorElementType() == MVT::f16 ||
2468 LT.second.getVectorElementType() == MVT::bf16) &&
2469 TLI->getOperationAction(ISDOpcode, LT.second) ==
2470 TargetLoweringBase::LegalizeAction::Promote) {
2471 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2472 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2473 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2474 // Add cost of extending arguments
2475 CastCost += LT.first * Args.size() *
2476 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2477 TTI::CastContextHint::None, CostKind);
2478 // Add cost of truncating result
2479 CastCost +=
2480 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2481 TTI::CastContextHint::None, CostKind);
2482 // Compute cost of op in promoted type
2483 LT.second = PromotedVT;
2484 }
2485
2486 auto getConstantMatCost =
2487 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2488 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2489 // Two sub-cases:
2490 // * Has a 5 bit immediate operand which can be splatted.
2491 // * Has a larger immediate which must be materialized in scalar register
2492 // We return 0 for both as we currently ignore the cost of materializing
2493 // scalar constants in GPRs.
2494 return 0;
2495
2496 return getConstantPoolLoadCost(Ty, CostKind);
2497 };
2498
2499 // Add the cost of materializing any constant vectors required.
2500 InstructionCost ConstantMatCost = 0;
2501 if (Op1Info.isConstant())
2502 ConstantMatCost += getConstantMatCost(0, Op1Info);
2503 if (Op2Info.isConstant())
2504 ConstantMatCost += getConstantMatCost(1, Op2Info);
2505
2506 unsigned Op;
2507 switch (ISDOpcode) {
2508 case ISD::ADD:
2509 case ISD::SUB:
2510 Op = RISCV::VADD_VV;
2511 break;
2512 case ISD::SHL:
2513 case ISD::SRL:
2514 case ISD::SRA:
2515 Op = RISCV::VSLL_VV;
2516 break;
2517 case ISD::AND:
2518 case ISD::OR:
2519 case ISD::XOR:
2520 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2521 break;
2522 case ISD::MUL:
2523 case ISD::MULHS:
2524 case ISD::MULHU:
2525 Op = RISCV::VMUL_VV;
2526 break;
2527 case ISD::SDIV:
2528 case ISD::UDIV:
2529 Op = RISCV::VDIV_VV;
2530 break;
2531 case ISD::SREM:
2532 case ISD::UREM:
2533 Op = RISCV::VREM_VV;
2534 break;
2535 case ISD::FADD:
2536 case ISD::FSUB:
2537 Op = RISCV::VFADD_VV;
2538 break;
2539 case ISD::FMUL:
2540 Op = RISCV::VFMUL_VV;
2541 break;
2542 case ISD::FDIV:
2543 Op = RISCV::VFDIV_VV;
2544 break;
2545 case ISD::FNEG:
2546 Op = RISCV::VFSGNJN_VV;
2547 break;
2548 default:
2549 // Assuming all other instructions have the same cost until a need arises to
2550 // differentiate them.
2551 return CastCost + ConstantMatCost +
2552 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2553 Args, CxtI);
2554 }
2555
2556 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2557 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2558 // ops are twice as expensive as integer ops. Do the same for vectors so
2559 // scalar floating point ops aren't cheaper than their vector equivalents.
2560 if (Ty->isFPOrFPVectorTy())
2561 InstrCost *= 2;
2562 return CastCost + ConstantMatCost + LT.first * InstrCost;
2563}
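// Illustrative example of the promotion above (a sketch): an fadd of an f16
// vector on a zvfhmin-only target is costed as fpext of both operands to
// f32, the f32 vfadd.vv (doubled like other FP ops), and an fptrunc of the
// result back to f16.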
2564
2565// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2566InstructionCost RISCVTTIImpl::getPointersChainCost(
2567 ArrayRef<const Value *> Ptrs, const Value *Base,
2568 const TTI::PointersChainInfo &Info, Type *AccessTy,
2569 TTI::TargetCostKind CostKind) const {
2570 InstructionCost Cost = TTI::TCC_Free;
2571 // In the basic model we take into account GEP instructions only
2572 // (although here can come alloca instruction, a value, constants and/or
2573 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2574 // pointer). Typically, if Base is a not a GEP-instruction and all the
2575 // pointers are relative to the same base address, all the rest are
2576 // either GEP instructions, PHIs, bitcasts or constants. When we have same
2577 // base, we just calculate cost of each non-Base GEP as an ADD operation if
2578 // any their index is a non-const.
2579 // If no known dependencies between the pointers cost is calculated as a sum
2580 // of costs of GEP instructions.
2581 for (auto [I, V] : enumerate(Ptrs)) {
2582 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2583 if (!GEP)
2584 continue;
2585 if (Info.isSameBase() && V != Base) {
2586 if (GEP->hasAllConstantIndices())
2587 continue;
2588 // If the chain is unit-stride and BaseReg + stride*i is a legal
2589 // addressing mode, then presume the base GEP is sitting around in a
2590 // register somewhere and check if we can fold the offset relative to
2591 // it.
2592 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2593 if (Info.isUnitStride() &&
2594 isLegalAddressingMode(AccessTy,
2595 /* BaseGV */ nullptr,
2596 /* BaseOffset */ Stride * I,
2597 /* HasBaseReg */ true,
2598 /* Scale */ 0,
2599 GEP->getType()->getPointerAddressSpace()))
2600 continue;
2601 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2602 {TTI::OK_AnyValue, TTI::OP_None},
2603 {TTI::OK_AnyValue, TTI::OP_None}, {});
2604 } else {
2605 SmallVector<const Value *> Indices(GEP->indices());
2606 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2607 Indices, AccessTy, CostKind);
2608 }
2609 }
2610 return Cost;
2611}
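// Illustrative example (a sketch): for a chain of unit-stride GEPs off one
// base pointer, each GEP whose constant offset (stride * i) fits the legal
// reg+imm addressing mode is treated as free, while a GEP with a variable
// index is costed as a scalar add.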
2612
2613void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2614 TTI::UnrollingPreferences &UP,
2615 OptimizationRemarkEmitter *ORE) const {
2616 // TODO: More tuning on benchmarks and metrics with changes as needed
2617 // would apply to all settings below to enable performance.
2618
2619
2620 if (ST->enableDefaultUnroll())
2621 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2622
2623 // Enable Upper bound unrolling universally, not dependent upon the conditions
2624 // below.
2625 UP.UpperBound = true;
2626
2627 // Disable loop unrolling for Oz and Os.
2628 UP.OptSizeThreshold = 0;
2629 UP.PartialOptSizeThreshold = 0;
2630 if (L->getHeader()->getParent()->hasOptSize())
2631 return;
2632
2633 SmallVector<BasicBlock *, 4> ExitingBlocks;
2634 L->getExitingBlocks(ExitingBlocks);
2635 LLVM_DEBUG(dbgs() << "Loop has:\n"
2636 << "Blocks: " << L->getNumBlocks() << "\n"
2637 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2638
2639 // Only allow another exit other than the latch. This acts as an early exit
2640 // as it mirrors the profitability calculation of the runtime unroller.
2641 if (ExitingBlocks.size() > 2)
2642 return;
2643
2644 // Limit the CFG of the loop body for targets with a branch predictor.
2645 // Allowing 4 blocks permits if-then-else diamonds in the body.
2646 if (L->getNumBlocks() > 4)
2647 return;
2648
2649 // Scan the loop: don't unroll loops with calls as this could prevent
2650 // inlining. Don't unroll auto-vectorized loops either, though do allow
2651 // unrolling of the scalar remainder.
2652 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2653 InstructionCost Cost = 0;
2654 for (auto *BB : L->getBlocks()) {
2655 for (auto &I : *BB) {
2656 // Both auto-vectorized loops and the scalar remainder have the
2657 // isvectorized attribute, so differentiate between them by the presence
2658 // of vector instructions.
2659 if (IsVectorized && I.getType()->isVectorTy())
2660 return;
2661
2662 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2663 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2664 if (!isLoweredToCall(F))
2665 continue;
2666 }
2667 return;
2668 }
2669
2670 SmallVector<const Value *> Operands(I.operand_values());
2671 Cost += getInstructionCost(&I, Operands,
2672 TargetTransformInfo::TCK_SizeAndLatency);
2673 }
2674 }
2675
2676 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2677
2678 UP.Partial = true;
2679 UP.Runtime = true;
2680 UP.UnrollRemainder = true;
2681 UP.UnrollAndJam = true;
2682
2683 // Force-unrolling small loops can be very useful because of the
2684 // branch-taken cost of the backedge.
2685 if (Cost < 12)
2686 UP.Force = true;
2687}
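// Illustrative example (a sketch): a loop with at most 4 blocks, at most two
// exiting blocks, and no calls or vector instructions gets Partial/Runtime
// unrolling enabled, and if its accumulated body cost is below 12 it is also
// force-unrolled.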
2688
2689void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2690 TTI::PeelingPreferences &PP) const {
2691 BaseT::getPeelingPreferences(L, SE, PP);
2692}
2693
2694unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const {
2695 if (Ty->isVectorTy()) {
2696 // f16 with only zvfhmin and bf16 will be promoted to f32
2697 Type *EltTy = cast<VectorType>(Ty)->getElementType();
2698 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
2699 EltTy->isBFloatTy())
2700 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
2701 cast<VectorType>(Ty));
2702
2703 TypeSize Size = DL.getTypeSizeInBits(Ty);
2704 if (Size.isScalable() && ST->hasVInstructions())
2705 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
2706
2707 if (ST->useRVVForFixedLengthVectors())
2708 return divideCeil(Size, ST->getRealMinVLen());
2709 }
2710
2711 return BaseT::getRegUsageForType(Ty);
2712}
2713
2714unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2715 if (SLPMaxVF.getNumOccurrences())
2716 return SLPMaxVF;
2717
2718 // Return how many elements can fit in getRegisterBitwidth. This is the
2719 // same routine as used in LoopVectorizer. We should probably be
2720 // accounting for whether we actually have instructions with the right
2721 // lane type, but we don't have enough information to do that without
2722 // some additional plumbing which hasn't been justified yet.
2723 TypeSize RegWidth =
2724 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
2725 // If no vector registers, or absurd element widths, disable
2726 // vectorization by returning 1.
2727 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
2728}
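// Illustrative example (a sketch, assuming the fixed-width register query
// reports 128 bits): with ElemWidth == 32 the SLP vectorizer is offered a
// maximum VF of 128 / 32 = 4, unless -riscv-v-slp-max-vf overrides it.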
2729
2730unsigned RISCVTTIImpl::getMinTripCountTailFoldingThreshold() const {
2731 return RVVMinTripCount;
2732}
2733
2734TTI::AddressingModeKind
2735RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
2736 ScalarEvolution *SE) const {
2737 if (ST->hasVendorXCVmem() && !ST->is64Bit())
2738 return TTI::AMK_PostIndexed;
2739
2740 return BaseT::getPreferredAddressingMode(L, SE);
2741}
2742
2743bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
2744 const TargetTransformInfo::LSRCost &C2) const {
2745 // RISC-V specific here are "instruction number 1st priority".
2746 // If we need to emit adds inside the loop to add up base registers, then
2747 // we need at least one extra temporary register.
2748 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
2749 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
2750 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
2751 C1.NumIVMuls, C1.NumBaseAdds,
2752 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2753 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
2754 C2.NumIVMuls, C2.NumBaseAdds,
2755 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2756}
2757
2758bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
2759 Align Alignment) const {
2760 auto *VTy = dyn_cast<VectorType>(DataTy);
2761 if (!VTy || VTy->isScalableTy())
2762 return false;
2763
2764 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2765 return false;
2766
2767 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
2768 // scalarize these types with LMUL >= maximum fixed-length LMUL.
2769 if (VTy->getElementType()->isIntegerTy(8))
2770 if (VTy->getElementCount().getFixedValue() > 256)
2771 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
2772 ST->getMaxLMULForFixedLengthVectors();
2773 return true;
2774}
2775
2776bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy,
2777 Align Alignment) const {
2778 auto *VTy = dyn_cast<VectorType>(DataTy);
2779 if (!VTy || VTy->isScalableTy())
2780 return false;
2781
2782 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2783 return false;
2784 return true;
2785}
2786
2787/// See if \p I should be considered for address type promotion. We check if \p
2788/// I is a sext with the right type and used in memory accesses. If it is used in a
2789/// "complex" getelementptr, we allow it to be promoted without finding other
2790/// sext instructions that sign extended the same initial value. A getelementptr
2791/// is considered as "complex" if it has more than 2 operands.
2792bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
2793 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
2794 bool Considerable = false;
2795 AllowPromotionWithoutCommonHeader = false;
2796 if (!isa<SExtInst>(&I))
2797 return false;
2798 Type *ConsideredSExtType =
2799 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2800 if (I.getType() != ConsideredSExtType)
2801 return false;
2802 // See if the sext is the one with the right type and used in at least one
2803 // GetElementPtrInst.
2804 for (const User *U : I.users()) {
2805 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2806 Considerable = true;
2807 // A getelementptr is considered as "complex" if it has more than 2
2808 // operands. We will promote a SExt used in such complex GEP as we
2809 // expect some computation to be merged if they are done on 64 bits.
2810 if (GEPInst->getNumOperands() > 2) {
2811 AllowPromotionWithoutCommonHeader = true;
2812 break;
2813 }
2814 }
2815 }
2816 return Considerable;
2817}
2818
2819bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
2820 switch (Opcode) {
2821 case Instruction::Add:
2822 case Instruction::Sub:
2823 case Instruction::Mul:
2824 case Instruction::And:
2825 case Instruction::Or:
2826 case Instruction::Xor:
2827 case Instruction::FAdd:
2828 case Instruction::FSub:
2829 case Instruction::FMul:
2830 case Instruction::FDiv:
2831 case Instruction::ICmp:
2832 case Instruction::FCmp:
2833 return true;
2834 case Instruction::Shl:
2835 case Instruction::LShr:
2836 case Instruction::AShr:
2837 case Instruction::UDiv:
2838 case Instruction::SDiv:
2839 case Instruction::URem:
2840 case Instruction::SRem:
2841 case Instruction::Select:
2842 return Operand == 1;
2843 default:
2844 return false;
2845 }
2846}
2847
2848bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
2849 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2850 return false;
2851
2852 if (canSplatOperand(I->getOpcode(), Operand))
2853 return true;
2854
2855 auto *II = dyn_cast<IntrinsicInst>(I);
2856 if (!II)
2857 return false;
2858
2859 switch (II->getIntrinsicID()) {
2860 case Intrinsic::fma:
2861 case Intrinsic::vp_fma:
2862 case Intrinsic::fmuladd:
2863 case Intrinsic::vp_fmuladd:
2864 return Operand == 0 || Operand == 1;
2865 case Intrinsic::vp_shl:
2866 case Intrinsic::vp_lshr:
2867 case Intrinsic::vp_ashr:
2868 case Intrinsic::vp_udiv:
2869 case Intrinsic::vp_sdiv:
2870 case Intrinsic::vp_urem:
2871 case Intrinsic::vp_srem:
2872 case Intrinsic::ssub_sat:
2873 case Intrinsic::vp_ssub_sat:
2874 case Intrinsic::usub_sat:
2875 case Intrinsic::vp_usub_sat:
2876 case Intrinsic::vp_select:
2877 return Operand == 1;
2878 // These intrinsics are commutative.
2879 case Intrinsic::vp_add:
2880 case Intrinsic::vp_mul:
2881 case Intrinsic::vp_and:
2882 case Intrinsic::vp_or:
2883 case Intrinsic::vp_xor:
2884 case Intrinsic::vp_fadd:
2885 case Intrinsic::vp_fmul:
2886 case Intrinsic::vp_icmp:
2887 case Intrinsic::vp_fcmp:
2888 case Intrinsic::smin:
2889 case Intrinsic::vp_smin:
2890 case Intrinsic::umin:
2891 case Intrinsic::vp_umin:
2892 case Intrinsic::smax:
2893 case Intrinsic::vp_smax:
2894 case Intrinsic::umax:
2895 case Intrinsic::vp_umax:
2896 case Intrinsic::sadd_sat:
2897 case Intrinsic::vp_sadd_sat:
2898 case Intrinsic::uadd_sat:
2899 case Intrinsic::vp_uadd_sat:
2900 // These intrinsics have 'vr' versions.
2901 case Intrinsic::vp_sub:
2902 case Intrinsic::vp_fsub:
2903 case Intrinsic::vp_fdiv:
2904 return Operand == 0 || Operand == 1;
2905 default:
2906 return false;
2907 }
2908}
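// Illustrative example (a sketch): for a vp.sdiv only operand 1 (the divisor)
// can use a .vx-style splat form, so only a splatted divisor is reported as
// splattable, while commutative intrinsics such as vp.add accept a splat on
// either operand.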
2909
2910/// Check if sinking \p I's operands to I's basic block is profitable, because
2911/// the operands can be folded into a target instruction, e.g.
2912/// splats of scalars can fold into vector instructions.
2913bool RISCVTTIImpl::isProfitableToSinkOperands(
2914 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
2915 using namespace llvm::PatternMatch;
2916
2917 if (I->isBitwiseLogicOp()) {
2918 if (!I->getType()->isVectorTy()) {
2919 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
2920 for (auto &Op : I->operands()) {
2921 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
2922 if (match(Op.get(), m_Not(m_Value()))) {
2923 Ops.push_back(&Op);
2924 return true;
2925 }
2926 }
2927 }
2928 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
2929 for (auto &Op : I->operands()) {
2930 // (and X, (not Y)) -> (vandn.vv X, Y)
2931 if (match(Op.get(), m_Not(m_Value()))) {
2932 Ops.push_back(&Op);
2933 return true;
2934 }
2935 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
2936 if (match(Op.get(), m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()),
2937 m_ZeroInt()),
2938 m_Value(), m_ZeroMask()))) {
2939 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
2940 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
2941 Ops.push_back(&Not);
2942 Ops.push_back(&InsertElt);
2943 Ops.push_back(&Op);
2944 return true;
2945 }
2946 }
2947 }
2948 }
2949
2950 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2951 return false;
2952
2953 // Don't sink splat operands if the target prefers not to. Some targets
2954 // require S2V transfer buffers and we can run out of them copying the same
2955 // value repeatedly.
2956 // FIXME: It could still be worth doing if it would improve vector register
2957 // pressure and prevent a vector spill.
2958 if (!ST->sinkSplatOperands())
2959 return false;
2960
2961 for (auto OpIdx : enumerate(I->operands())) {
2962 if (!canSplatOperand(I, OpIdx.index()))
2963 continue;
2964
2965 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2966 // Make sure we are not already sinking this operand
2967 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2968 continue;
2969
2970 // We are looking for a splat/vp.splat that can be sunk.
2971 bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
2972 m_Value(), m_Value(), m_Value()));
2973 if (!IsVPSplat &&
2974 !match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
2975 m_Undef(), m_ZeroMask())))
2976 continue;
2977
2978 // Don't sink i1 splats.
2979 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
2980 continue;
2981
2982 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2983 // and vector registers
2984 for (Use &U : Op->uses()) {
2985 Instruction *Insn = cast<Instruction>(U.getUser());
2986 if (!canSplatOperand(Insn, U.getOperandNo()))
2987 return false;
2988 }
2989
2990 // Sink any fpexts since they might be used in a widening fp pattern.
2991 if (IsVPSplat) {
2992 if (isa<FPExtInst>(Op->getOperand(0)))
2993 Ops.push_back(&Op->getOperandUse(0));
2994 } else {
2995 Use *InsertEltUse = &Op->getOperandUse(0);
2996 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
2997 if (isa<FPExtInst>(InsertElt->getOperand(1)))
2998 Ops.push_back(&InsertElt->getOperandUse(1));
2999 Ops.push_back(InsertEltUse);
3000 }
3001 Ops.push_back(&OpIdx.value());
3002 }
3003 return true;
3004}
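// Illustrative example (a sketch): given "%ie = insertelement poison, %x, 0"
// and "%splat = shufflevector %ie, poison, zeroinitializer" feeding a vector
// add in another block, both the insertelement and the shuffle are reported
// as sinkable so ISel can fold them into vadd.vx, provided every user of the
// splat can accept a splat operand.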
3005
3006TTI::MemCmpExpansionOptions
3007RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3008 TTI::MemCmpExpansionOptions Options;
3009 // TODO: Enable expansion when unaligned access is not supported after we fix
3010 // issues in ExpandMemcmp.
3011 if (!ST->enableUnalignedScalarMem())
3012 return Options;
3013
3014 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3015 return Options;
3016
3017 Options.AllowOverlappingLoads = true;
3018 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3019 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3020 if (ST->is64Bit()) {
3021 Options.LoadSizes = {8, 4, 2, 1};
3022 Options.AllowedTailExpansions = {3, 5, 6};
3023 } else {
3024 Options.LoadSizes = {4, 2, 1};
3025 Options.AllowedTailExpansions = {3};
3026 }
3027
3028 if (IsZeroCmp && ST->hasVInstructions()) {
3029 unsigned VLenB = ST->getRealMinVLen() / 8;
3030 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3031 // `VLenB * MaxLMUL` so that it fits in a single register group.
3032 unsigned MinSize = ST->getXLen() / 8 + 1;
3033 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3034 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3035 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3036 }
3037 return Options;
3038}
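// Illustrative example (a sketch): on RV64 with unaligned scalar access and
// Zbb, overlapping 8/4/2/1-byte loads are offered for memcmp expansion; with
// V enabled, equality-only compares of up to VLenB * MaxLMUL bytes (but more
// than XLen/8) can additionally be expanded using vector loads.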
Definition: InstrTypes.h:688
bool isFPPredicate() const
Definition: InstrTypes.h:784
bool isIntPredicate() const
Definition: InstrTypes.h:785
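A minimal usage sketch for the predicate classifiers above; the helper name isEqualityStyle is illustrative, not an LLVM API:
#include "llvm/IR/InstrTypes.h"
using namespace llvm;
// True for the equality-style predicates among those listed above, using the
// static predicate classifiers on CmpInst.
static bool isEqualityStyle(CmpInst::Predicate P) {
  if (CmpInst::isIntPredicate(P))
    return P == CmpInst::ICMP_EQ || P == CmpInst::ICMP_NE;
  if (CmpInst::isFPPredicate(P))
    return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE;
  return false;
}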
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
Definition: DataLayout.h:481
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:842
TypeSize getTypeSizeInBits(Type *Ty) const
Size in bits of the specified type (e.g. i1 is 1 bit, i36 is 36 bits); see the header comment for further size examples.
Definition: DataLayout.h:674
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:468
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:846
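A small sketch of these DataLayout queries; the layout string is an assumption chosen to resemble a 64-bit RISC-V target:
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;
void queryLayout() {
  LLVMContext Ctx;
  // Illustrative data layout string, roughly matching an rv64 target.
  DataLayout DL("e-m:e-p:64:64-i64:64-i128:128-n32:64-S128");
  Type *I64 = Type::getInt64Ty(Ctx);
  TypeSize Bits = DL.getTypeSizeInBits(I64);  // 64 bits
  TypeSize Bytes = DL.getTypeStoreSize(I64);  // 8 bytes
  Align ABIAlign = DL.getABITypeAlign(I64);   // 8-byte alignment under this layout
  (void)Bits; (void)Bytes; (void)ABIAlign;
}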
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
bool noNaNs() const
Definition: FMF.h:65
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
Definition: DerivedTypes.h:627
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:949
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full range of operator support required for arithmetic and comparisons.
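A brief sketch of how invalid costs propagate, so that getValue() is only reached on valid costs (helper name illustrative):
#include "llvm/Support/InstructionCost.h"
using namespace llvm;
// Invalid costs are sticky through arithmetic, so callers check isValid()
// before asking for the numeric value.
InstructionCost accumulate(InstructionCost A, InstructionCost B) {
  InstructionCost Sum = A;
  Sum += B;                            // stays invalid if either input was invalid
  if (!Sum.isValid())
    return InstructionCost::getInvalid();
  return Sum;
}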
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
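A sketch of the MVT queries above; the concrete types are illustrative:
#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cassert>
using namespace llvm;
// Widen the element type of a fixed-width vector MVT while keeping the lane count.
void widenElements() {
  MVT VT = MVT::v8i16;                                            // 8 x i16, 128 bits
  MVT WideElt = MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);  // i32
  MVT WideVT = VT.changeVectorElementType(WideElt);               // v8i32
  assert(WideVT.getVectorNumElements() == VT.getVectorNumElements());
  assert(WideVT.getFixedSizeInBits() == 2 * VT.getFixedSizeInBits());
  (void)WideVT;
}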
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition: Operator.h:43
The optimization diagnostic interface.
unsigned getMaxLMULForFixedLengthVectors() const
bool hasVInstructionsF64() const
unsigned getRealMinVLen() const
bool useRVVForFixedLengthVectors() const
bool hasVInstructionsBF16Minimal() const
bool hasVInstructionsF16Minimal() const
unsigned getXLen() const
bool hasConditionalMoveFusion() const
bool hasVInstructionsF16() const
bool hasVInstructions() const
std::optional< unsigned > getRealVLen() const
bool hasOptimizedSegmentLoadStore(unsigned NF) const
unsigned getRealMaxVLen() const
bool hasVInstructionsF32() const
unsigned getELen() const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMinTripCountTailFoldingThreshold() const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind) const
Return the cost of materializing an immediate for a value operand of a store instruction.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
bool hasActiveVectorLength() const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *Src, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat operand for the given Operand number.
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded into a target instruction during instruction selection.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
std::optional< unsigned > getMaxVScale() const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
std::optional< unsigned > getVScaleForTuning() const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
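A sketch of how an IR-level pass would consult hooks like these through the TargetTransformInfo wrapper rather than this class directly (helper names illustrative):
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;
// Ask whether a masked gather of this vector type is legal on the target.
static bool canUseMaskedGather(const TargetTransformInfo &TTI, VectorType *VTy,
                               Align Alignment) {
  return TTI.isLegalMaskedGather(VTy, Alignment);
}
// Ask whether population count is fast for an integer of the given width.
static bool hasFastPopcount(const TargetTransformInfo &TTI, unsigned BitWidth) {
  return TTI.getPopcntSupport(BitWidth) ==
         TargetTransformInfo::PSK_FastHardware;
}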
static MVT getM1VT(MVT VT)
Given a vector (either fixed or scalable), return the scalable vector corresponding to a single vector register (i.e. an LMUL=1 type).
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in SelectionDAGBuilder.
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
MVT getContainerForFixedLengthVector(MVT VT) const
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace, const DataLayout &) const
Returns whether or not generating an interleaved load/store intrinsic for this type will be legal.
static RISCVVType::VLMUL getLMUL(MVT VT)
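A sketch of the register-grouping helpers above; the values noted in the comments assume the standard RVVBitsPerBlock of 64, and the sketch presumes it is built inside the RISC-V backend where RISCVISelLowering.h is visible:
#include "RISCVISelLowering.h"
using namespace llvm;
void inspectRegisterGrouping() {
  MVT VT = MVT::nxv4i32;                               // vscale x 4 x i32
  // With RVVBitsPerBlock = 64, nxv4i32 occupies an LMUL=2 register group,
  // and its single-register (M1) container is nxv2i32.
  RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
  MVT M1VT = RISCVTargetLowering::getM1VT(VT);
  (void)LMUL; (void)M1VT;
}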
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:283
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:574
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger size, needs to be expanded to some other code sequence, or the target has a custom expander for it.
MVT getTypeToPromoteTo(unsigned Op, MVT VT) const
If the action for this operation is to promote, this method returns the ValueType to promote to.
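A sketch of mapping an IR opcode to its ISD node and querying how the target treats it (helper name illustrative):
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
// Map an IR add to ISD::ADD and ask whether the target handles it natively
// for the given value type.
static bool addIsLegalFor(const TargetLoweringBase &TLI, EVT VT) {
  int ISDOpc = TLI.InstructionOpcodeToISD(Instruction::Add);
  return TLI.getOperationAction(ISDOpc, VT) == TargetLoweringBase::Legal;
}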
virtual const DataLayout & getDataLayout() const
virtual TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
virtual bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
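A sketch that queries a shuffle cost with one of the kinds above; the remaining getShuffleCost parameters keep their defaults, and the helper name is illustrative:
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;
// Cost of broadcasting lane 0 of a fixed vector, measured in reciprocal throughput
// (the default cost kind for this query).
static InstructionCost broadcastCost(const TargetTransformInfo &TTI,
                                     FixedVectorType *VTy) {
  return TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VTy, VTy);
}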
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:349
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
Value * getOperand(unsigned i) const
Definition: User.h:232
LLVM Value Representation.
Definition: Value.h:75
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
Definition: DerivedTypes.h:695
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:463
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:194
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of the scalar value passed in.
Definition: TypeSize.h:184
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:233
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:219
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the same way as for normal integer types.
Definition: TypeSize.h:255
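A sketch with illustrative values for the scalable-quantity helpers above:
#include "llvm/Support/TypeSize.h"
using namespace llvm;
void scalableArithmetic() {
  ElementCount EC = ElementCount::getScalable(4);    // vscale x 4 lanes
  if (EC.isKnownMultipleOf(2))
    EC = EC.divideCoefficientBy(2);                  // vscale x 2 lanes
  TypeSize Fixed = TypeSize::getFixed(128);          // exactly 128 bits
  TypeSize Scalable = TypeSize::getScalable(128);    // vscale x 128 bits
  bool Known = TypeSize::isKnownLE(Fixed, Scalable); // true: holds for any vscale >= 1
  (void)EC; (void)Known;
}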
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:862
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type iN containing the high bits of the result.
Definition: ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
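A sketch using these matchers to recognize the canonical splat idiom; m_ZeroMask is a further PatternMatch helper assumed available, and the function name is illustrative:
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;
// Recognize an insertelement into undef/poison followed by a zero-mask
// shufflevector, and return the scalar being splatted (or nullptr).
static Value *getSplattedScalar(Value *V) {
  Value *Scalar = nullptr;
  if (match(V, m_Shuffle(m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt()),
                         m_Undef(), m_ZeroMask())))
    return Scalar;
  return nullptr;
}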
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:349
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
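A sketch of a static cost table lookup in the style target cost hooks use; the table contents are illustrative, not real RISC-V numbers:
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;
static unsigned lookupIllustrativeCost(int ISD, MVT VT) {
  static const CostTblEntry Tbl[] = {
      {ISD::ADD, MVT::v4i32, 1},   // illustrative cost values
      {ISD::MUL, MVT::v4i32, 3},
  };
  if (const auto *Entry = CostTableLookup(Tbl, ISD, VT))
    return Entry->Cost;
  return 0; // not in the table; a caller would fall back to a generic estimate
}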
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1121
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, ...), where A is a zero-based index and the remaining elements come from the input ranges.
Definition: STLExtras.h:2491
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:157
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition: MathExtras.h:282
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
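A sketch of the mask helpers above; the expected mask contents in the comments follow their documented behaviour:
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
void buildMasks() {
  // Pick every second lane starting at 0 from a VF=4 vector: <0, 2, 4, 6>.
  auto Stride = createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // Interleave two VF=4 vectors: <0, 4, 1, 5, 2, 6, 3, 7>.
  auto Interleave = createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
  // A mask that copies lanes straight through is an identity mask.
  bool IsIdentity = ShuffleVectorInst::isIdentityMask({0, 1, 2, 3}, /*NumSrcElts=*/4);
  (void)Stride; (void)Interleave; (void)IsIdentity;
}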
DWARFExpression::Operation Op
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1854
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
InstructionCost Cost
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2107
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:280
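A few worked values for the bit-math helpers above, checked against their documented semantics:
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;
void bitMathExamples() {
  assert(Log2_32(9) == 3);            // floor(log2(9))
  assert(Log2_32_Ceil(9) == 4);       // ceil(log2(9))
  assert(countr_zero(0x0FF0u) == 4);  // four trailing zero bits
  assert(isShiftedMask_64(0x0FF0));   // one contiguous run of ones
  assert(isPowerOf2_32(64));
  assert(divideCeil(10, 4) == 3);     // integer ceil(10 / 4)
  assert(bit_floor(20u) == 16u);      // largest power of two <= 20
}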
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:858
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:216
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compare and branch overhead).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
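A sketch of filling in these unrolling knobs; the chosen values are an illustrative policy, not the RISC-V defaults:
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;
static void tunePreferences(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;              // allow partial unrolling
  UP.Runtime = true;              // allow runtime unrolling
  UP.UnrollRemainder = true;      // allow unrolling the runtime remainder loop
  UP.UpperBound = true;           // may use the trip-count upper bound
  UP.PartialOptSizeThreshold = 0; // disable partial/runtime unrolling at -Os
}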