RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
20#include <cmath>
21#include <optional>
22using namespace llvm;
23using namespace llvm::PatternMatch;
24
25#define DEBUG_TYPE "riscvtti"
26
28 "riscv-v-register-bit-width-lmul",
30 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
31 "by autovectorized code. Fractional LMULs are not supported."),
33
35 "riscv-v-slp-max-vf",
37 "Overrides result used for getMaximumVF query which is used "
38 "exclusively by SLP vectorizer."),
40
42 RVVMinTripCount("riscv-v-min-trip-count",
43 cl::desc("Set the lower bound of a trip count to decide on "
44 "vectorization while tail-folding."),
46
47InstructionCost
48RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
49                                      TTI::TargetCostKind CostKind) const {
50  // Check if the type is valid for all CostKind
51  if (!VT.isVector())
52    return InstructionCost::getInvalid();
53  size_t NumInstr = OpCodes.size();
54  if (CostKind == TTI::TCK_CodeSize)
55    return NumInstr;
56  InstructionCost LMULCost = TLI->getLMULCost(VT);
57  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
58    return LMULCost * NumInstr;
59 InstructionCost Cost = 0;
60 for (auto Op : OpCodes) {
61 switch (Op) {
62 case RISCV::VRGATHER_VI:
63 Cost += TLI->getVRGatherVICost(VT);
64 break;
65 case RISCV::VRGATHER_VV:
66 Cost += TLI->getVRGatherVVCost(VT);
67 break;
68 case RISCV::VSLIDEUP_VI:
69 case RISCV::VSLIDEDOWN_VI:
70 Cost += TLI->getVSlideVICost(VT);
71 break;
72 case RISCV::VSLIDEUP_VX:
73 case RISCV::VSLIDEDOWN_VX:
74 Cost += TLI->getVSlideVXCost(VT);
75 break;
76 case RISCV::VREDMAX_VS:
77 case RISCV::VREDMIN_VS:
78 case RISCV::VREDMAXU_VS:
79 case RISCV::VREDMINU_VS:
80 case RISCV::VREDSUM_VS:
81 case RISCV::VREDAND_VS:
82 case RISCV::VREDOR_VS:
83 case RISCV::VREDXOR_VS:
84 case RISCV::VFREDMAX_VS:
85 case RISCV::VFREDMIN_VS:
86 case RISCV::VFREDUSUM_VS: {
87 unsigned VL = VT.getVectorMinNumElements();
88 if (!VT.isFixedLengthVector())
89 VL *= *getVScaleForTuning();
90 Cost += Log2_32_Ceil(VL);
91 break;
92 }
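    // Unlike the tree reductions above (cost ~ log2(VL)), the ordered FP sum
    // below must preserve the source order of the additions, so its cost
    // scales linearly with VL.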
93 case RISCV::VFREDOSUM_VS: {
94 unsigned VL = VT.getVectorMinNumElements();
95 if (!VT.isFixedLengthVector())
96 VL *= *getVScaleForTuning();
97 Cost += VL;
98 break;
99 }
100 case RISCV::VMV_X_S:
101 case RISCV::VMV_S_X:
102 case RISCV::VFMV_F_S:
103 case RISCV::VFMV_S_F:
104 case RISCV::VMOR_MM:
105 case RISCV::VMXOR_MM:
106 case RISCV::VMAND_MM:
107 case RISCV::VMANDN_MM:
108 case RISCV::VMNAND_MM:
109 case RISCV::VCPOP_M:
110 case RISCV::VFIRST_M:
111 Cost += 1;
112 break;
113 default:
114 Cost += LMULCost;
115 }
116 }
117 return Cost;
118}
119
120static InstructionCost getIntImmCostImpl(const DataLayout &DL,
121                                         const RISCVSubtarget *ST,
122                                         const APInt &Imm, Type *Ty,
123                                         TTI::TargetCostKind CostKind,
124                                         bool FreeZeroes) {
125 assert(Ty->isIntegerTy() &&
126 "getIntImmCost can only estimate cost of materialising integers");
127
128 // We have a Zero register, so 0 is always free.
129 if (Imm == 0)
130 return TTI::TCC_Free;
131
132 // Otherwise, we check how many instructions it will take to materialise.
133 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
134 /*CompressionCost=*/false, FreeZeroes);
135}
136
138InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
139                                            TTI::TargetCostKind CostKind) const {
140  return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
141}
142
143// Look for patterns of shift followed by AND that can be turned into a pair of
144// shifts. We won't need to materialize an immediate for the AND so these can
145// be considered free.
146static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
147 uint64_t Mask = Imm.getZExtValue();
148 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
149 if (!BO || !BO->hasOneUse())
150 return false;
151
152 if (BO->getOpcode() != Instruction::Shl)
153 return false;
154
155 if (!isa<ConstantInt>(BO->getOperand(1)))
156 return false;
157
158 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
159 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
160 // is a mask shifted by c2 bits with c3 leading zeros.
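  // Illustrative example: with c1 = 0xFF0 and c2 = 4 on RV64, c3 = 52, so
  // (and (shl x, 4), 0xFF0) becomes (srli (slli x, 56), 52) and the 0xFF0
  // immediate never needs to be materialized.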
161 if (isShiftedMask_64(Mask)) {
162 unsigned Trailing = llvm::countr_zero(Mask);
163 if (ShAmt == Trailing)
164 return true;
165 }
166
167 return false;
168}
169
170// If this i64 AND is part of ((X & -(1 << C1) & 0xffffffff) == (C2 << C1)),
171// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
172// the type will be split so only the lower 32 bits need to be compared using
173// (srai/srli X, C) == C2.
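// Illustrative example: (X & 0xFFFFFF00) == 0x1200 uses Mask = -(1 << 8) in
// the low 32 bits and can become (sraiw X, 8) == 0x12 on RV64, so the mask
// immediate is free.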
174static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
175 if (!Inst->hasOneUse())
176 return false;
177
178 // Look for equality comparison.
179 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
180 if (!Cmp || !Cmp->isEquality())
181 return false;
182
183 // Right hand side of comparison should be a constant.
184 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
185 if (!C)
186 return false;
187
188 uint64_t Mask = Imm.getZExtValue();
189
190 // Mask should be of the form -(1 << C) in the lower 32 bits.
191 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
192 return false;
193
194 // Comparison constant should be a subset of Mask.
195 uint64_t CmpC = C->getZExtValue();
196 if ((CmpC & Mask) != CmpC)
197 return false;
198
199 // We'll need to sign extend the comparison constant and shift it right. Make
200 // sure the new constant can use addi/xori+seqz/snez.
201 unsigned ShiftBits = llvm::countr_zero(Mask);
202 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
203 return NewCmpC >= -2048 && NewCmpC <= 2048;
204}
205
206InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
207                                                const APInt &Imm, Type *Ty,
208                                                TTI::TargetCostKind CostKind,
209                                                Instruction *Inst) const {
210 assert(Ty->isIntegerTy() &&
211 "getIntImmCost can only estimate cost of materialising integers");
212
213 // We have a Zero register, so 0 is always free.
214 if (Imm == 0)
215 return TTI::TCC_Free;
216
217 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
218 // commutative, in others the immediate comes from a specific argument index.
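  // (The 12-bit immediate is the sign-extended I-type field used by
  // addi/andi/ori/xori, i.e. values in [-2048, 2047].)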
219 bool Takes12BitImm = false;
220 unsigned ImmArgIdx = ~0U;
221
222 switch (Opcode) {
223 case Instruction::GetElementPtr:
224 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
225 // split up large offsets in GEP into better parts than ConstantHoisting
226 // can.
227 return TTI::TCC_Free;
228 case Instruction::Store: {
229    // Use the materialization cost regardless of whether it's the address or
230    // the value that is constant, except when the store is misaligned and
231    // misaligned accesses are not legal (experience shows constant hoisting
232    // can sometimes be harmful in such cases).
233 if (Idx == 1 || !Inst)
234 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
235 /*FreeZeroes=*/true);
236
237 StoreInst *ST = cast<StoreInst>(Inst);
238 if (!getTLI()->allowsMemoryAccessForAlignment(
239 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
240 ST->getPointerAddressSpace(), ST->getAlign()))
241 return TTI::TCC_Free;
242
243 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
244 /*FreeZeroes=*/true);
245 }
246 case Instruction::Load:
247 // If the address is a constant, use the materialization cost.
248 return getIntImmCost(Imm, Ty, CostKind);
249 case Instruction::And:
250 // zext.h
251 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
252 return TTI::TCC_Free;
253 // zext.w
254 if (Imm == UINT64_C(0xffffffff) &&
255 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
256 return TTI::TCC_Free;
257 // bclri
258 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
259 return TTI::TCC_Free;
260 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
261 canUseShiftPair(Inst, Imm))
262 return TTI::TCC_Free;
263 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
264 canUseShiftCmp(Inst, Imm))
265 return TTI::TCC_Free;
266 Takes12BitImm = true;
267 break;
268 case Instruction::Add:
269 Takes12BitImm = true;
270 break;
271 case Instruction::Or:
272 case Instruction::Xor:
273 // bseti/binvi
274 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
275 return TTI::TCC_Free;
276 Takes12BitImm = true;
277 break;
278 case Instruction::Mul:
279 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
280 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
281 return TTI::TCC_Free;
282 // One more or less than a power of 2 can use SLLI+ADD/SUB.
283 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
284 return TTI::TCC_Free;
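    // Illustrative examples: x * 8 is a single slli and x * 7 can be
    // (x << 3) - x, so both immediates are free; something like x * 4100 has
    // no such form and falls through to the materialization cost below.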
285 // FIXME: There is no MULI instruction.
286 Takes12BitImm = true;
287 break;
288 case Instruction::Sub:
289 case Instruction::Shl:
290 case Instruction::LShr:
291 case Instruction::AShr:
292 Takes12BitImm = true;
293 ImmArgIdx = 1;
294 break;
295 default:
296 break;
297 }
298
299 if (Takes12BitImm) {
300 // Check immediate is the correct argument...
301 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
302 // ... and fits into the 12-bit immediate.
303 if (Imm.getSignificantBits() <= 64 &&
304 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
305 return TTI::TCC_Free;
306 }
307 }
308
309 // Otherwise, use the full materialisation cost.
310 return getIntImmCost(Imm, Ty, CostKind);
311 }
312
313 // By default, prevent hoisting.
314 return TTI::TCC_Free;
315}
316
317InstructionCost RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID,
318                                                  unsigned Idx,
319                                                  const APInt &Imm, Type *Ty,
320                                                  TTI::TargetCostKind CostKind) const {
321 // Prevent hoisting in unknown cases.
322 return TTI::TCC_Free;
323}
324
326 return ST->hasVInstructions();
327}
328
329TargetTransformInfo::PopcntSupportKind
330RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
331 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
332 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
333}
334
336 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
338 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
340
341 // zve32x is broken for partial_reduce_umla, but let's make sure we
342 // don't generate them.
343 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
344 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
345 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
346 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
348
349 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
351  // Note: Assuming all vqdot* variants are equal cost
352 return LT.first *
353 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
354}
355
357 // Currently, the ExpandReductions pass can't expand scalable-vector
358 // reductions, but we still request expansion as RVV doesn't support certain
359 // reductions and the SelectionDAG can't legalize them either.
360 switch (II->getIntrinsicID()) {
361 default:
362 return false;
363 // These reductions have no equivalent in RVV
364 case Intrinsic::vector_reduce_mul:
365 case Intrinsic::vector_reduce_fmul:
366 return true;
367 }
368}
369
370std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
371 if (ST->hasVInstructions())
372 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
373 return BaseT::getMaxVScale();
374}
375
376std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
377 if (ST->hasVInstructions())
378 if (unsigned MinVLen = ST->getRealMinVLen();
379 MinVLen >= RISCV::RVVBitsPerBlock)
380 return MinVLen / RISCV::RVVBitsPerBlock;
381  return BaseT::getVScaleForTuning();
382}
383
384TypeSize
385RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
386  unsigned LMUL =
387      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
388  switch (K) {
389  case TargetTransformInfo::RGK_Scalar:
390    return TypeSize::getFixed(ST->getXLen());
391  case TargetTransformInfo::RGK_FixedWidthVector:
392    return TypeSize::getFixed(
393        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
394  case TargetTransformInfo::RGK_ScalableVector:
395    return TypeSize::getScalable(
396        (ST->hasVInstructions() &&
397         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
398            ? LMUL * RISCV::RVVBitsPerBlock
399            : 0);
400 }
401
402 llvm_unreachable("Unsupported register kind");
403}
404
405InstructionCost
406RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
407                                      TTI::TargetCostKind CostKind) const {
408 // Add a cost of address generation + the cost of the load. The address
409 // is expected to be a PC relative offset to a constant pool entry
410 // using auipc/addi.
411 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
412 /*AddressSpace=*/0, CostKind);
413}
414
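// Returns true if Mask is a repeated concatenation of its leading
// SubVectorSize elements, e.g. <0, 1, 0, 1, 0, 1, 0, 1> sets SubVectorSize
// to 2 (illustrative example).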
415static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
416 unsigned Size = Mask.size();
417 if (!isPowerOf2_32(Size))
418 return false;
419 for (unsigned I = 0; I != Size; ++I) {
420 if (static_cast<unsigned>(Mask[I]) == I)
421 continue;
422 if (Mask[I] != 0)
423 return false;
424 if (Size % I != 0)
425 return false;
426 for (unsigned J = I + 1; J != Size; ++J)
427 // Check the pattern is repeated.
428 if (static_cast<unsigned>(Mask[J]) != J % I)
429 return false;
430 SubVectorSize = I;
431 return true;
432 }
433  // Reaching here means Mask is an identity mask (e.g. <0, 1, 2, 3>), not a concatenation.
434 return false;
435}
436
437static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
438                                        LLVMContext &C) {
439 assert((DataVT.getScalarSizeInBits() != 8 ||
440 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
441 MVT IndexVT = DataVT.changeTypeToInteger();
442 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
443 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
444 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
445}
446
447/// Attempt to approximate the cost of a shuffle which will require splitting
448/// during legalization. Note that processShuffleMasks is not an exact proxy
449/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
450/// reasonably close upper bound.
451static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI,
452                                               MVT LegalVT, VectorType *Tp,
453                                               ArrayRef<int> Mask,
454                                               TTI::TargetCostKind CostKind) {
455 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
456 "Expected fixed vector type and non-empty mask");
457 unsigned LegalNumElts = LegalVT.getVectorNumElements();
458 // Number of destination vectors after legalization:
459 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
460 // We are going to permute multiple sources and the result will be in
461 // multiple destinations. Providing an accurate cost only for splits where
462 // the element type remains the same.
463  if (NumOfDests <= 1 ||
464      LegalVT.getVectorElementType().getSizeInBits() !=
465          Tp->getElementType()->getPrimitiveSizeInBits() ||
466      LegalNumElts >= Tp->getElementCount().getFixedValue())
467    return InstructionCost::getInvalid();
468
469 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
470 unsigned LegalVTSize = LegalVT.getStoreSize();
471 // Number of source vectors after legalization:
472 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
473
474 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
475
476 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
477 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
478 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
479 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
480 assert(NormalizedVF >= Mask.size() &&
481 "Normalized mask expected to be not shorter than original mask.");
482 copy(Mask, NormalizedMask.begin());
483 InstructionCost Cost = 0;
484 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
486 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
487 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
488 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
489 return;
490 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
491 .second)
492 return;
493 Cost += TTI.getShuffleCost(
495 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
496 SingleOpTy, RegMask, CostKind, 0, nullptr);
497 },
498 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
499 Cost += TTI.getShuffleCost(
501 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
502 SingleOpTy, RegMask, CostKind, 0, nullptr);
503 });
504 return Cost;
505}
506
507/// Try to perform better estimation of the permutation.
508/// 1. Split the source/destination vectors into real registers.
509/// 2. Do the mask analysis to identify which real registers are
510/// permuted. If more than one source register is used to build a
511/// destination register, the cost for this destination register is
512/// (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
513/// source register is used, build the mask and calculate the cost as a cost
514/// of PermuteSingleSrc.
515/// Also, for the single register permute we try to identify if the
516/// destination register is just a copy of the source register or the
517/// copy of the previous destination register (the cost is
518/// TTI::TCC_Basic). If the source register is just reused, the cost for
519/// this operation is 0.
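/// Illustrative example: with VLEN=128, a <8 x i64> shuffle spans four vector
/// registers; when every destination register can be built from a single
/// source register, the shuffle is costed as (up to) four single-source
/// permutes instead of one large LMUL=4 vrgather.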
520static InstructionCost
521costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT,
522                            std::optional<unsigned> VLen, VectorType *Tp,
523                            ArrayRef<int> Mask, TTI::TargetCostKind CostKind) {
524 assert(LegalVT.isFixedLengthVector());
525  if (!VLen || Mask.empty())
526    return InstructionCost::getInvalid();
527 MVT ElemVT = LegalVT.getVectorElementType();
528 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
529 LegalVT = TTI.getTypeLegalizationCost(
530 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
531 .second;
532 // Number of destination vectors after legalization:
533 InstructionCost NumOfDests =
534 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
535  if (NumOfDests <= 1 ||
536      LegalVT.getVectorElementType().getSizeInBits() !=
537          Tp->getElementType()->getPrimitiveSizeInBits() ||
538      LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
539    return InstructionCost::getInvalid();
540
541 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
542 unsigned LegalVTSize = LegalVT.getStoreSize();
543 // Number of source vectors after legalization:
544 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
545
546 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
547 LegalVT.getVectorNumElements());
548
549 unsigned E = NumOfDests.getValue();
550 unsigned NormalizedVF =
551 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
552 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
553 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
554 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
555 assert(NormalizedVF >= Mask.size() &&
556 "Normalized mask expected to be not shorter than original mask.");
557 copy(Mask, NormalizedMask.begin());
558 InstructionCost Cost = 0;
559 int NumShuffles = 0;
560 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
562 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
563 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
564 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
565 return;
566 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
567 .second)
568 return;
569 ++NumShuffles;
570 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
571 SingleOpTy, RegMask, CostKind, 0, nullptr);
572 },
573 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
574 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
575 SingleOpTy, RegMask, CostKind, 0, nullptr);
576 NumShuffles += 2;
577 });
578 // Note: check that we do not emit too many shuffles here to prevent code
579 // size explosion.
580 // TODO: investigate, if it can be improved by extra analysis of the masks
581 // to check if the code is more profitable.
582 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
583 (NumOfDestRegs <= 2 && NumShuffles < 4))
584 return Cost;
585  return InstructionCost::getInvalid();
586}
587
588InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
589 ArrayRef<int> Mask,
591 // Avoid missing masks and length changing shuffles
592 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
594
595 int NumElts = Tp->getNumElements();
596 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
597 // Avoid scalarization cases
598 if (!LT.second.isFixedLengthVector())
600
601 // Requires moving elements between parts, which requires additional
602 // unmodeled instructions.
603 if (LT.first != 1)
605
606 auto GetSlideOpcode = [&](int SlideAmt) {
607 assert(SlideAmt != 0);
608 bool IsVI = isUInt<5>(std::abs(SlideAmt));
609 if (SlideAmt < 0)
610 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
611 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
612 };
613
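  // SrcInfo describes the shuffle as (at most) two slid sources: a slide
  // amount of zero means that source needs no slide, and a second source of
  // -1 means only one source is used. E.g. a pure single-source slide costs
  // just one vslide, while a two-source combination adds either a second
  // slide or a vmerge plus the mask constant.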
614 std::array<std::pair<int, int>, 2> SrcInfo;
615 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
617
618 if (SrcInfo[1].second == 0)
619 std::swap(SrcInfo[0], SrcInfo[1]);
620
621 InstructionCost FirstSlideCost = 0;
622 if (SrcInfo[0].second != 0) {
623 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
624 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
625 }
626
627 if (SrcInfo[1].first == -1)
628 return FirstSlideCost;
629
630 InstructionCost SecondSlideCost = 0;
631 if (SrcInfo[1].second != 0) {
632 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
633 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
634 } else {
635 SecondSlideCost =
636 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
637 }
638
639 auto EC = Tp->getElementCount();
640 VectorType *MaskTy =
642 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
643 return FirstSlideCost + SecondSlideCost + MaskCost;
644}
645
648 VectorType *SrcTy, ArrayRef<int> Mask,
649 TTI::TargetCostKind CostKind, int Index,
651 const Instruction *CxtI) const {
652 assert((Mask.empty() || DstTy->isScalableTy() ||
653 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
654 "Expected the Mask to match the return size if given");
655 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
656 "Expected the same scalar types");
657
658 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
659 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
660
661 // First, handle cases where having a fixed length vector enables us to
662 // give a more accurate cost than falling back to generic scalable codegen.
663 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
664 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
665 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
666    InstructionCost VRegSplittingCost = costShuffleViaVRegSplitting(
667        *this, LT.second, ST->getRealVLen(),
668 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
669 if (VRegSplittingCost.isValid())
670 return VRegSplittingCost;
671 switch (Kind) {
672 default:
673 break;
674  case TTI::SK_PermuteSingleSrc: {
675    if (Mask.size() >= 2) {
676 MVT EltTp = LT.second.getVectorElementType();
677 // If the size of the element is < ELEN then shuffles of interleaves and
678 // deinterleaves of 2 vectors can be lowered into the following
679 // sequences
680 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
681 // Example sequence:
682 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
683 // vwaddu.vv v10, v8, v9
684 // li a0, -1 (ignored)
685 // vwmaccu.vx v10, a0, v9
686 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
687 return 2 * LT.first * TLI->getLMULCost(LT.second);
688
689 if (Mask[0] == 0 || Mask[0] == 1) {
690 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
691 // Example sequence:
692 // vnsrl.wi v10, v8, 0
693 if (equal(DeinterleaveMask, Mask))
694 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
695 LT.second, CostKind);
696 }
697 }
698 int SubVectorSize;
699 if (LT.second.getScalarSizeInBits() != 1 &&
700 isRepeatedConcatMask(Mask, SubVectorSize)) {
701      InstructionCost Cost = 0;
702      unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
703 // The cost of extraction from a subvector is 0 if the index is 0.
704 for (unsigned I = 0; I != NumSlides; ++I) {
705 unsigned InsertIndex = SubVectorSize * (1 << I);
706 FixedVectorType *SubTp =
707 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
708 FixedVectorType *DestTp =
710 std::pair<InstructionCost, MVT> DestLT =
712 // Add the cost of whole vector register move because the
713 // destination vector register group for vslideup cannot overlap the
714 // source.
715 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
716 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
717 CostKind, InsertIndex, SubTp);
718 }
719 return Cost;
720 }
721 }
722
723 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
724 SlideCost.isValid())
725 return SlideCost;
726
727 // vrgather + cost of generating the mask constant.
728 // We model this for an unknown mask with a single vrgather.
729 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
730 LT.second.getVectorNumElements() <= 256)) {
731 VectorType *IdxTy =
732 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
733 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
734 return IndexCost +
735 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
736 }
737 break;
738 }
739  case TTI::SK_Transpose:
740  case TTI::SK_PermuteTwoSrc: {
741
742 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
743 SlideCost.isValid())
744 return SlideCost;
745
746 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
747 // register for the second vrgather. We model this for an unknown
748 // (shuffle) mask.
749 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
750 LT.second.getVectorNumElements() <= 256)) {
751 auto &C = SrcTy->getContext();
752 auto EC = SrcTy->getElementCount();
753 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
755 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
756 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
757 return 2 * IndexCost +
758 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
759 LT.second, CostKind) +
760 MaskCost;
761 }
762 break;
763 }
764 }
765
766 auto shouldSplit = [](TTI::ShuffleKind Kind) {
767 switch (Kind) {
768 default:
769 return false;
773 return true;
774 }
775 };
776
777 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
778 shouldSplit(Kind)) {
779 InstructionCost SplitCost =
780 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
781 if (SplitCost.isValid())
782 return SplitCost;
783 }
784 }
785
786 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
787 switch (Kind) {
788 default:
789 // Fallthrough to generic handling.
790 // TODO: Most of these cases will return getInvalid in generic code, and
791 // must be implemented here.
792 break;
793  case TTI::SK_ExtractSubvector:
794    // Extract at zero is always a subregister extract
795 if (Index == 0)
796 return TTI::TCC_Free;
797
798 // If we're extracting a subvector of at most m1 size at a sub-register
799 // boundary - which unfortunately we need exact vlen to identify - this is
800 // a subregister extract at worst and thus won't require a vslidedown.
801 // TODO: Extend for aligned m2, m4 subvector extracts
802    // TODO: Extend for misaligned (but contained) extracts
803 // TODO: Extend for scalable subvector types
804 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
805 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
806 if (std::optional<unsigned> VLen = ST->getRealVLen();
807 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
808 SubLT.second.getSizeInBits() <= *VLen)
809 return TTI::TCC_Free;
810 }
811
812 // Example sequence:
813 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
814 // vslidedown.vi v8, v9, 2
815 return LT.first *
816 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
817  case TTI::SK_InsertSubvector:
818    // Example sequence:
819 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
820 // vslideup.vi v8, v9, 2
821 LT = getTypeLegalizationCost(DstTy);
822 return LT.first *
823 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
824 case TTI::SK_Select: {
825 // Example sequence:
826 // li a0, 90
827 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
828 // vmv.s.x v0, a0
829 // vmerge.vvm v8, v9, v8, v0
830 // We use 2 for the cost of the mask materialization as this is the true
831 // cost for small masks and most shuffles are small. At worst, this cost
832 // should be a very small constant for the constant pool load. As such,
833 // we may bias towards large selects slightly more than truly warranted.
834 return LT.first *
835 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
836 LT.second, CostKind));
837 }
838 case TTI::SK_Broadcast: {
839 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
840 Instruction::InsertElement);
841 if (LT.second.getScalarSizeInBits() == 1) {
842 if (HasScalar) {
843 // Example sequence:
844 // andi a0, a0, 1
845 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
846 // vmv.v.x v8, a0
847 // vmsne.vi v0, v8, 0
848 return LT.first *
849 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
850 LT.second, CostKind));
851 }
852 // Example sequence:
853 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
854 // vmv.v.i v8, 0
855 // vmerge.vim v8, v8, 1, v0
856 // vmv.x.s a0, v8
857 // andi a0, a0, 1
858 // vmv.v.x v8, a0
859 // vmsne.vi v0, v8, 0
860
861 return LT.first *
862 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
863 RISCV::VMV_X_S, RISCV::VMV_V_X,
864 RISCV::VMSNE_VI},
865 LT.second, CostKind));
866 }
867
868 if (HasScalar) {
869 // Example sequence:
870 // vmv.v.x v8, a0
871 return LT.first *
872 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
873 }
874
875 // Example sequence:
876 // vrgather.vi v9, v8, 0
877 return LT.first *
878 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
879 }
880 case TTI::SK_Splice: {
881 // vslidedown+vslideup.
882 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
883 // of similar code, but I think we expand through memory.
884 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
885 if (Index >= 0 && Index < 32)
886 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
887 else if (Index < 0 && Index > -32)
888 Opcodes[1] = RISCV::VSLIDEUP_VI;
889 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
890 }
891 case TTI::SK_Reverse: {
892
893 if (!LT.second.isVector())
894      return InstructionCost::getInvalid();
895
896 // TODO: Cases to improve here:
897 // * Illegal vector types
898 // * i64 on RV32
899 if (SrcTy->getElementType()->isIntegerTy(1)) {
900 VectorType *WideTy =
901 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
902 cast<VectorType>(SrcTy)->getElementCount());
903 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
905 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
906 nullptr) +
907 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
909 }
910
911 MVT ContainerVT = LT.second;
912 if (LT.second.isFixedLengthVector())
913 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
914 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
915 if (ContainerVT.bitsLE(M1VT)) {
916 // Example sequence:
917 // csrr a0, vlenb
918 // srli a0, a0, 3
919 // addi a0, a0, -1
920 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
921 // vid.v v9
922 // vrsub.vx v10, v9, a0
923 // vrgather.vv v9, v8, v10
924 InstructionCost LenCost = 3;
925 if (LT.second.isFixedLengthVector())
926 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
927 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
928 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
929 if (LT.second.isFixedLengthVector() &&
930 isInt<5>(LT.second.getVectorNumElements() - 1))
931 Opcodes[1] = RISCV::VRSUB_VI;
932 InstructionCost GatherCost =
933 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
934 return LT.first * (LenCost + GatherCost);
935 }
936
937 // At high LMUL, we split into a series of M1 reverses (see
938 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
939 // the resulting gap at the bottom (for fixed vectors only). The important
940 // bit is that the cost scales linearly, not quadratically with LMUL.
941 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
942 InstructionCost FixedCost =
943 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
944 unsigned Ratio =
946 InstructionCost GatherCost =
947 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
948 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
949 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
950 return FixedCost + LT.first * (GatherCost + SlideCost);
951 }
952 }
953 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
954 SubTp);
955}
956
957static unsigned isM1OrSmaller(MVT VT) {
958  RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
959  return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
960          LMUL == RISCVVType::VLMUL::LMUL_F4 ||
961          LMUL == RISCVVType::VLMUL::LMUL_F2 ||
962          LMUL == RISCVVType::VLMUL::LMUL_1);
963}
964
965InstructionCost RISCVTTIImpl::getScalarizationOverhead(
966    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
967 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
968 ArrayRef<Value *> VL) const {
969  if (isa<ScalableVectorType>(Ty))
970    return InstructionCost::getInvalid();
971
972 // A build_vector (which is m1 sized or smaller) can be done in no
973 // worse than one vslide1down.vx per element in the type. We could
974 // in theory do an explode_vector in the inverse manner, but our
975 // lowering today does not have a first class node for this pattern.
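  // For example (illustrative): building a <4 x i32> from scalars is capped
  // below at 4x the cost of a single vslide1down.vx.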
976  InstructionCost Cost = BaseT::getScalarizationOverhead(
977      Ty, DemandedElts, Insert, Extract, CostKind);
978 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
979 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
980 if (Ty->getScalarSizeInBits() == 1) {
981 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
982 // Note: Implicit scalar anyextend is assumed to be free since the i1
983 // must be stored in a GPR.
984 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
985 CostKind) +
986 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
988 }
989
990 assert(LT.second.isFixedLengthVector());
991 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
992 if (isM1OrSmaller(ContainerVT)) {
993 InstructionCost BV =
994 cast<FixedVectorType>(Ty)->getNumElements() *
995 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
996 if (BV < Cost)
997 Cost = BV;
998 }
999 }
1000 return Cost;
1001}
1002
1003InstructionCost
1004RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1005                                    unsigned AddressSpace,
1006                                    TTI::TargetCostKind CostKind) const {
1007  if (!isLegalMaskedLoadStore(Src, Alignment) ||
1008      CostKind != TTI::TCK_RecipThroughput)
1009 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1010 CostKind);
1011
1012 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1013}
1014
1015InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
1016    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1017 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1018 bool UseMaskForCond, bool UseMaskForGaps) const {
1019
1020  // The interleaved memory access pass will lower (de)interleave ops combined
1021  // with an adjacent memory operation into vlseg/vsseg intrinsics. vlseg/vsseg
1022  // only support masking per-iteration (i.e. condition), not per-segment
1023  // (i.e. gap).
1024 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1025 auto *VTy = cast<VectorType>(VecTy);
1026 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1027    // Need to make sure the type hasn't been scalarized
1028 if (LT.second.isVector()) {
1029 auto *SubVecTy =
1030 VectorType::get(VTy->getElementType(),
1031 VTy->getElementCount().divideCoefficientBy(Factor));
1032 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1033 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1034 AddressSpace, DL)) {
1035
1036 // Some processors optimize segment loads/stores as one wide memory op +
1037 // Factor * LMUL shuffle ops.
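          // e.g. (illustrative) a factor-4 segment load whose sub-vector type
          // legalizes to LMUL=1 is costed as one wide memory op plus four
          // LMUL-scaled shuffle ops.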
1038 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1039          InstructionCost Cost =
1040              getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1041 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1042 Cost += Factor * TLI->getLMULCost(SubVecVT);
1043 return LT.first * Cost;
1044 }
1045
1046 // Otherwise, the cost is proportional to the number of elements (VL *
1047 // Factor ops).
1048 InstructionCost MemOpCost =
1049 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1050 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1051 unsigned NumLoads = getEstimatedVLFor(VTy);
1052 return NumLoads * MemOpCost;
1053 }
1054 }
1055 }
1056
1057  // TODO: Return the cost of interleaved accesses for scalable vectors when
1058  // unable to convert them to segment access instructions.
1059 if (isa<ScalableVectorType>(VecTy))
1061
1062 auto *FVTy = cast<FixedVectorType>(VecTy);
1063 InstructionCost MemCost =
1064 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1065 unsigned VF = FVTy->getNumElements() / Factor;
1066
1067 // An interleaved load will look like this for Factor=3:
1068 // %wide.vec = load <12 x i32>, ptr %3, align 4
1069 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1070 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1071 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1072 if (Opcode == Instruction::Load) {
1073 InstructionCost Cost = MemCost;
1074 for (unsigned Index : Indices) {
1075 FixedVectorType *VecTy =
1076 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1077 auto Mask = createStrideMask(Index, Factor, VF);
1078 Mask.resize(VF * Factor, -1);
1079 InstructionCost ShuffleCost =
1081 Mask, CostKind, 0, nullptr, {});
1082 Cost += ShuffleCost;
1083 }
1084 return Cost;
1085 }
1086
1087 // TODO: Model for NF > 2
1088 // We'll need to enhance getShuffleCost to model shuffles that are just
1089 // inserts and extracts into subvectors, since they won't have the full cost
1090 // of a vrgather.
1091 // An interleaved store for 3 vectors of 4 lanes will look like
1092 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1093 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1094 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1095 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1096 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1097 if (Factor != 2)
1098 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1099 Alignment, AddressSpace, CostKind,
1100 UseMaskForCond, UseMaskForGaps);
1101
1102 assert(Opcode == Instruction::Store && "Opcode must be a store");
1103 // For an interleaving store of 2 vectors, we perform one large interleaving
1104 // shuffle that goes into the wide store
1105 auto Mask = createInterleaveMask(VF, Factor);
1106 InstructionCost ShuffleCost =
1108 CostKind, 0, nullptr, {});
1109 return MemCost + ShuffleCost;
1110}
1111
1113 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1114 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1116 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1117 Alignment, CostKind, I);
1118
1119 if ((Opcode == Instruction::Load &&
1120 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1121 (Opcode == Instruction::Store &&
1122 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1123 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1124 Alignment, CostKind, I);
1125
1126 // Cost is proportional to the number of memory operations implied. For
1127 // scalable vectors, we use an estimate on that number since we don't
1128 // know exactly what VL will be.
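  // e.g. (illustrative) a gather from <8 x i32> is costed as 8 element-sized
  // memory operations.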
1129 auto &VTy = *cast<VectorType>(DataTy);
1130 InstructionCost MemOpCost =
1131 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1132 {TTI::OK_AnyValue, TTI::OP_None}, I);
1133 unsigned NumLoads = getEstimatedVLFor(&VTy);
1134 return NumLoads * MemOpCost;
1135}
1136
1138 unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
1139 TTI::TargetCostKind CostKind, const Instruction *I) const {
1140 bool IsLegal = (Opcode == Instruction::Store &&
1141 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1142 (Opcode == Instruction::Load &&
1143 isLegalMaskedExpandLoad(DataTy, Alignment));
1144 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1145 return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
1146 Alignment, CostKind, I);
1147 // Example compressstore sequence:
1148 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1149 // vcompress.vm v10, v8, v0
1150 // vcpop.m a1, v0
1151 // vsetvli zero, a1, e32, m2, ta, ma
1152 // vse32.v v10, (a0)
1153 // Example expandload sequence:
1154 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1155 // vcpop.m a1, v0
1156 // vsetvli zero, a1, e32, m2, ta, ma
1157 // vle32.v v10, (a0)
1158 // vsetivli zero, 8, e32, m2, ta, ma
1159 // viota.m v12, v0
1160 // vrgather.vv v8, v10, v12, v0.t
1161 auto MemOpCost =
1162 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1163 auto LT = getTypeLegalizationCost(DataTy);
1164 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1165 if (VariableMask)
1166 Opcodes.push_back(RISCV::VCPOP_M);
1167 if (Opcode == Instruction::Store)
1168 Opcodes.append({RISCV::VCOMPRESS_VM});
1169 else
1170 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1171 return MemOpCost +
1172 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1173}
1174
1176 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1177 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1178 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1179 !isLegalStridedLoadStore(DataTy, Alignment)) ||
1180 (Opcode != Instruction::Load && Opcode != Instruction::Store))
1181 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
1182 Alignment, CostKind, I);
1183
1185 return TTI::TCC_Basic;
1186
1187 // Cost is proportional to the number of memory operations implied. For
1188 // scalable vectors, we use an estimate on that number since we don't
1189 // know exactly what VL will be.
1190 auto &VTy = *cast<VectorType>(DataTy);
1191 InstructionCost MemOpCost =
1192 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1193 {TTI::OK_AnyValue, TTI::OP_None}, I);
1194 unsigned NumLoads = getEstimatedVLFor(&VTy);
1195 return NumLoads * MemOpCost;
1196}
1197
1198InstructionCost
1199RISCVTTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
1200  // FIXME: This is a property of the default vector convention, not
1201 // all possible calling conventions. Fixing that will require
1202 // some TTI API and SLP rework.
1204  InstructionCost Cost = 0;
1205  for (auto *Ty : Tys) {
1206 if (!Ty->isVectorTy())
1207 continue;
1208 Align A = DL.getPrefTypeAlign(Ty);
1209 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1210 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1211 }
1212 return Cost;
1213}
1214
1215// Currently, these represent both throughput and codesize costs
1216// for the respective intrinsics. The costs in this table are simply
1217// instruction counts with the following adjustments made:
1218// * One vsetvli is considered free.
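// The table is keyed on the element type of the legalized vector; the lookup
// at the end of getIntrinsicInstrCost multiplies the matching entry by
// LT.first (the number of legalized vector parts).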
1219static const CostTblEntry VectorIntrinsicCostTable[]{
1220    {Intrinsic::floor, MVT::f32, 9},
1221 {Intrinsic::floor, MVT::f64, 9},
1222 {Intrinsic::ceil, MVT::f32, 9},
1223 {Intrinsic::ceil, MVT::f64, 9},
1224 {Intrinsic::trunc, MVT::f32, 7},
1225 {Intrinsic::trunc, MVT::f64, 7},
1226 {Intrinsic::round, MVT::f32, 9},
1227 {Intrinsic::round, MVT::f64, 9},
1228 {Intrinsic::roundeven, MVT::f32, 9},
1229 {Intrinsic::roundeven, MVT::f64, 9},
1230 {Intrinsic::rint, MVT::f32, 7},
1231 {Intrinsic::rint, MVT::f64, 7},
1232 {Intrinsic::nearbyint, MVT::f32, 9},
1233 {Intrinsic::nearbyint, MVT::f64, 9},
1234 {Intrinsic::bswap, MVT::i16, 3},
1235 {Intrinsic::bswap, MVT::i32, 12},
1236 {Intrinsic::bswap, MVT::i64, 31},
1237 {Intrinsic::vp_bswap, MVT::i16, 3},
1238 {Intrinsic::vp_bswap, MVT::i32, 12},
1239 {Intrinsic::vp_bswap, MVT::i64, 31},
1240 {Intrinsic::vp_fshl, MVT::i8, 7},
1241 {Intrinsic::vp_fshl, MVT::i16, 7},
1242 {Intrinsic::vp_fshl, MVT::i32, 7},
1243 {Intrinsic::vp_fshl, MVT::i64, 7},
1244 {Intrinsic::vp_fshr, MVT::i8, 7},
1245 {Intrinsic::vp_fshr, MVT::i16, 7},
1246 {Intrinsic::vp_fshr, MVT::i32, 7},
1247 {Intrinsic::vp_fshr, MVT::i64, 7},
1248 {Intrinsic::bitreverse, MVT::i8, 17},
1249 {Intrinsic::bitreverse, MVT::i16, 24},
1250 {Intrinsic::bitreverse, MVT::i32, 33},
1251 {Intrinsic::bitreverse, MVT::i64, 52},
1252 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1253 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1254 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1255 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1256 {Intrinsic::ctpop, MVT::i8, 12},
1257 {Intrinsic::ctpop, MVT::i16, 19},
1258 {Intrinsic::ctpop, MVT::i32, 20},
1259 {Intrinsic::ctpop, MVT::i64, 21},
1260 {Intrinsic::ctlz, MVT::i8, 19},
1261 {Intrinsic::ctlz, MVT::i16, 28},
1262 {Intrinsic::ctlz, MVT::i32, 31},
1263 {Intrinsic::ctlz, MVT::i64, 35},
1264 {Intrinsic::cttz, MVT::i8, 16},
1265 {Intrinsic::cttz, MVT::i16, 23},
1266 {Intrinsic::cttz, MVT::i32, 24},
1267 {Intrinsic::cttz, MVT::i64, 25},
1268 {Intrinsic::vp_ctpop, MVT::i8, 12},
1269 {Intrinsic::vp_ctpop, MVT::i16, 19},
1270 {Intrinsic::vp_ctpop, MVT::i32, 20},
1271 {Intrinsic::vp_ctpop, MVT::i64, 21},
1272 {Intrinsic::vp_ctlz, MVT::i8, 19},
1273 {Intrinsic::vp_ctlz, MVT::i16, 28},
1274 {Intrinsic::vp_ctlz, MVT::i32, 31},
1275 {Intrinsic::vp_ctlz, MVT::i64, 35},
1276 {Intrinsic::vp_cttz, MVT::i8, 16},
1277 {Intrinsic::vp_cttz, MVT::i16, 23},
1278 {Intrinsic::vp_cttz, MVT::i32, 24},
1279 {Intrinsic::vp_cttz, MVT::i64, 25},
1280};
1281
1282InstructionCost
1283RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1284                                    TTI::TargetCostKind CostKind) const {
1285  auto *RetTy = ICA.getReturnType();
1286 switch (ICA.getID()) {
1287 case Intrinsic::lrint:
1288 case Intrinsic::llrint:
1289 case Intrinsic::lround:
1290 case Intrinsic::llround: {
1291 auto LT = getTypeLegalizationCost(RetTy);
1292 Type *SrcTy = ICA.getArgTypes().front();
1293 auto SrcLT = getTypeLegalizationCost(SrcTy);
1294 if (ST->hasVInstructions() && LT.second.isVector()) {
1296 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1297 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1298 if (LT.second.getVectorElementType() == MVT::bf16) {
1299 if (!ST->hasVInstructionsBF16Minimal())
1301 if (DstEltSz == 32)
1302 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1303 else
1304 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1305 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1306 !ST->hasVInstructionsF16()) {
1307 if (!ST->hasVInstructionsF16Minimal())
1309 if (DstEltSz == 32)
1310 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1311 else
1312 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1313
1314 } else if (SrcEltSz > DstEltSz) {
1315 Ops = {RISCV::VFNCVT_X_F_W};
1316 } else if (SrcEltSz < DstEltSz) {
1317 Ops = {RISCV::VFWCVT_X_F_V};
1318 } else {
1319 Ops = {RISCV::VFCVT_X_F_V};
1320 }
1321
1322 // We need to use the source LMUL in the case of a narrowing op, and the
1323 // destination LMUL otherwise.
1324 if (SrcEltSz > DstEltSz)
1325 return SrcLT.first *
1326 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1327 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1328 }
1329 break;
1330 }
1331 case Intrinsic::ceil:
1332 case Intrinsic::floor:
1333 case Intrinsic::trunc:
1334 case Intrinsic::rint:
1335 case Intrinsic::round:
1336 case Intrinsic::roundeven: {
1337 // These all use the same code.
1338 auto LT = getTypeLegalizationCost(RetTy);
1339 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1340 return LT.first * 8;
1341 break;
1342 }
1343 case Intrinsic::umin:
1344 case Intrinsic::umax:
1345 case Intrinsic::smin:
1346 case Intrinsic::smax: {
1347 auto LT = getTypeLegalizationCost(RetTy);
1348 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1349 return LT.first;
1350
1351 if (ST->hasVInstructions() && LT.second.isVector()) {
1352 unsigned Op;
1353 switch (ICA.getID()) {
1354 case Intrinsic::umin:
1355 Op = RISCV::VMINU_VV;
1356 break;
1357 case Intrinsic::umax:
1358 Op = RISCV::VMAXU_VV;
1359 break;
1360 case Intrinsic::smin:
1361 Op = RISCV::VMIN_VV;
1362 break;
1363 case Intrinsic::smax:
1364 Op = RISCV::VMAX_VV;
1365 break;
1366 }
1367 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1368 }
1369 break;
1370 }
1371 case Intrinsic::sadd_sat:
1372 case Intrinsic::ssub_sat:
1373 case Intrinsic::uadd_sat:
1374 case Intrinsic::usub_sat: {
1375 auto LT = getTypeLegalizationCost(RetTy);
1376 if (ST->hasVInstructions() && LT.second.isVector()) {
1377 unsigned Op;
1378 switch (ICA.getID()) {
1379 case Intrinsic::sadd_sat:
1380 Op = RISCV::VSADD_VV;
1381 break;
1382 case Intrinsic::ssub_sat:
1383 Op = RISCV::VSSUBU_VV;
1384 break;
1385 case Intrinsic::uadd_sat:
1386 Op = RISCV::VSADDU_VV;
1387 break;
1388 case Intrinsic::usub_sat:
1389 Op = RISCV::VSSUBU_VV;
1390 break;
1391 }
1392 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1393 }
1394 break;
1395 }
1396 case Intrinsic::fma:
1397 case Intrinsic::fmuladd: {
1398 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1399 auto LT = getTypeLegalizationCost(RetTy);
1400 if (ST->hasVInstructions() && LT.second.isVector())
1401 return LT.first *
1402 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1403 break;
1404 }
1405 case Intrinsic::fabs: {
1406 auto LT = getTypeLegalizationCost(RetTy);
1407 if (ST->hasVInstructions() && LT.second.isVector()) {
1408 // lui a0, 8
1409 // addi a0, a0, -1
1410 // vsetvli a1, zero, e16, m1, ta, ma
1411 // vand.vx v8, v8, a0
1412      // f16 with zvfhmin and bf16 with zvfbfmin
1413 if (LT.second.getVectorElementType() == MVT::bf16 ||
1414 (LT.second.getVectorElementType() == MVT::f16 &&
1415 !ST->hasVInstructionsF16()))
1416 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1417 CostKind) +
1418 2;
1419 else
1420 return LT.first *
1421 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1422 }
1423 break;
1424 }
1425 case Intrinsic::sqrt: {
1426 auto LT = getTypeLegalizationCost(RetTy);
1427 if (ST->hasVInstructions() && LT.second.isVector()) {
1430 MVT ConvType = LT.second;
1431 MVT FsqrtType = LT.second;
1432 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1433      // will be split.
1434 if (LT.second.getVectorElementType() == MVT::bf16) {
1435 if (LT.second == MVT::nxv32bf16) {
1436 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1437 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1438 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1439 ConvType = MVT::nxv16f16;
1440 FsqrtType = MVT::nxv16f32;
1441 } else {
1442 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1443 FsqrtOp = {RISCV::VFSQRT_V};
1444 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1445 }
1446 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1447 !ST->hasVInstructionsF16()) {
1448 if (LT.second == MVT::nxv32f16) {
1449 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1450 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1451 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1452 ConvType = MVT::nxv16f16;
1453 FsqrtType = MVT::nxv16f32;
1454 } else {
1455 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1456 FsqrtOp = {RISCV::VFSQRT_V};
1457 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1458 }
1459 } else {
1460 FsqrtOp = {RISCV::VFSQRT_V};
1461 }
1462
1463 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1464 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1465 }
1466 break;
1467 }
1468 case Intrinsic::cttz:
1469 case Intrinsic::ctlz:
1470 case Intrinsic::ctpop: {
1471 auto LT = getTypeLegalizationCost(RetTy);
1472 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1473 unsigned Op;
1474 switch (ICA.getID()) {
1475 case Intrinsic::cttz:
1476 Op = RISCV::VCTZ_V;
1477 break;
1478 case Intrinsic::ctlz:
1479 Op = RISCV::VCLZ_V;
1480 break;
1481 case Intrinsic::ctpop:
1482 Op = RISCV::VCPOP_V;
1483 break;
1484 }
1485 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1486 }
1487 break;
1488 }
1489 case Intrinsic::abs: {
1490 auto LT = getTypeLegalizationCost(RetTy);
1491 if (ST->hasVInstructions() && LT.second.isVector()) {
1492 // vrsub.vi v10, v8, 0
1493 // vmax.vv v8, v8, v10
1494 return LT.first *
1495 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1496 LT.second, CostKind);
1497 }
1498 break;
1499 }
1500 case Intrinsic::get_active_lane_mask: {
1501 if (ST->hasVInstructions()) {
1502 Type *ExpRetTy = VectorType::get(
1503 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1504 auto LT = getTypeLegalizationCost(ExpRetTy);
1505
1506 // vid.v v8 // considered hoisted
1507 // vsaddu.vx v8, v8, a0
1508 // vmsltu.vx v0, v8, a1
1509 return LT.first *
1510 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1511 LT.second, CostKind);
1512 }
1513 break;
1514 }
1515 // TODO: add more intrinsic
1516 case Intrinsic::stepvector: {
1517 auto LT = getTypeLegalizationCost(RetTy);
1518 // Legalisation of illegal types involves an `index' instruction plus
1519 // (LT.first - 1) vector adds.
1520 if (ST->hasVInstructions())
1521 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1522 (LT.first - 1) *
1523 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1524 return 1 + (LT.first - 1);
1525 }
1526 case Intrinsic::experimental_cttz_elts: {
1527 Type *ArgTy = ICA.getArgTypes()[0];
1528 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1529 if (getTLI()->shouldExpandCttzElements(ArgType))
1530 break;
1531 InstructionCost Cost = getRISCVInstructionCost(
1532 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1533
1534 // If zero_is_poison is false, then we will generate additional
1535 // cmp + select instructions to convert -1 to EVL.
1536 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1537 if (ICA.getArgs().size() > 1 &&
1538 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1539 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1541 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1543
1544 return Cost;
1545 }
1546 case Intrinsic::experimental_vp_splat: {
1547 auto LT = getTypeLegalizationCost(RetTy);
1548 // TODO: Lower i1 experimental_vp_splat
1549 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1551 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1552 ? RISCV::VFMV_V_F
1553 : RISCV::VMV_V_X,
1554 LT.second, CostKind);
1555 }
1556 case Intrinsic::experimental_vp_splice: {
1557    // To support type-based queries from the vectorizer, set the index to 0.
1558    // Note that the index only changes the cost from vslide.vx to vslide.vi,
1559    // and in the current implementation they have the same cost.
1561 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1563 }
1564 case Intrinsic::fptoui_sat:
1565 case Intrinsic::fptosi_sat: {
1567 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1568 Type *SrcTy = ICA.getArgTypes()[0];
1569
1570 auto SrcLT = getTypeLegalizationCost(SrcTy);
1571 auto DstLT = getTypeLegalizationCost(RetTy);
1572 if (!SrcTy->isVectorTy())
1573 break;
1574
1575 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1577
1578 Cost +=
1579 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1580 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1581
1582 // Handle NaN.
1583 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1584 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1585 Type *CondTy = RetTy->getWithNewBitWidth(1);
1586 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1588 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1590 return Cost;
1591 }
1592 }
1593
1594 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1595 if (auto LT = getTypeLegalizationCost(RetTy);
1596 LT.second.isVector()) {
1597 MVT EltTy = LT.second.getVectorElementType();
1598 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1599 ICA.getID(), EltTy))
1600 return LT.first * Entry->Cost;
1601 }
1602 }
1603
1604  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1605}
1606
1607InstructionCost
1608RISCVTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
1609                                        const SCEV *Ptr,
1610                                        TTI::TargetCostKind CostKind) const {
1611 // Address computations for vector indexed load/store likely require an offset
1612 // and/or scaling.
1613 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1614 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1615
1616 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1617}
1618
1619InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1620                                               Type *Src,
1621                                               TTI::CastContextHint CCH,
1622                                               TTI::TargetCostKind CostKind,
1623                                               const Instruction *I) const {
1624 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1625 if (!IsVectorType)
1626 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1627
1628 // FIXME: Need to compute legalizing cost for illegal types. The current
1629 // code handles only legal types and those which can be trivially
1630 // promoted to legal.
1631 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1632 Dst->getScalarSizeInBits() > ST->getELen())
1633 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1634
1635 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1636 assert(ISD && "Invalid opcode");
1637 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1638 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1639
1640 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1641 // The shared implementation doesn't model vector widening during legalization
1642 // and instead assumes scalarization. In order to scalarize an <N x i1>
1643 // vector, we need to extend/trunc to/from i8. If we don't special case
1644 // this, we can get an infinite recursion cycle.
1645 switch (ISD) {
1646 default:
1647 break;
1648 case ISD::SIGN_EXTEND:
1649 case ISD::ZERO_EXTEND:
1650 if (Src->getScalarSizeInBits() == 1) {
1651 // We do not use vsext/vzext to extend from mask vector.
1652 // Instead we use the following instructions to extend from mask vector:
1653 // vmv.v.i v8, 0
1654 // vmerge.vim v8, v8, -1, v0 (repeated per split)
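// For example (illustrative, assuming the destination fits in one register
// group): extending <vscale x 4 x i1> to <vscale x 4 x i32> is costed from
// the vmv.v.i / vmerge.vim sequence above.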
1655 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1656 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1657 DstLT.second, CostKind) +
1658 DstLT.first - 1;
1659 }
1660 break;
1661 case ISD::TRUNCATE:
1662 if (Dst->getScalarSizeInBits() == 1) {
1663 // We do not lower this via a chain of vncvt instructions when truncating to
1664 // a mask vector, so PowDiff cannot be used to calculate the cost.
1665 // Instead we use the following instructions to truncate to mask vector:
1666 // vand.vi v8, v8, 1
1667 // vmsne.vi v0, v8, 0
1668 return SrcLT.first *
1669 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1670 SrcLT.second, CostKind) +
1671 SrcLT.first - 1;
1672 }
1673 break;
1674 };
1675
1676 // Our actual lowering for the case where a wider legal type is available
1677 // uses promotion to the wider type. This is reflected in the result of
1678 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1679 // scalarized if the legalized Src and Dst are not equal sized.
1680 const DataLayout &DL = this->getDataLayout();
1681 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1682 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1683 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1684 SrcLT.second.getSizeInBits()) ||
1685 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1686 DstLT.second.getSizeInBits()))
1687 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1688
1689 // The split cost is handled by the base getCastInstrCost
1690 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1691
1692 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1693 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
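// For example (illustrative): i8 -> i32 gives PowDiff == 2 and maps to a
// single vzext.vf4/vsext.vf4 below; differences outside the vf2/vf4/vf8
// range fall back to the base implementation.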
1694 switch (ISD) {
1695 case ISD::SIGN_EXTEND:
1696 case ISD::ZERO_EXTEND: {
1697 if ((PowDiff < 1) || (PowDiff > 3))
1698 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1699 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1700 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1701 unsigned Op =
1702 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1703 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1704 }
1705 case ISD::TRUNCATE:
1706 case ISD::FP_EXTEND:
1707 case ISD::FP_ROUND: {
1708 // Counts of narrow/widen instructions.
1709 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1710 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1711
1712 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1713 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1714 : RISCV::VFNCVT_F_F_W;
1715 InstructionCost Cost = 0;
1716 for (; SrcEltSize != DstEltSize;) {
1717 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1718 ? MVT::getIntegerVT(DstEltSize)
1719 : MVT::getFloatingPointVT(DstEltSize);
1720 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1721 DstEltSize =
1722 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1723 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1724 }
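// For example (illustrative): truncating i64 -> i8 walks 64 -> 32 -> 16 -> 8
// and accumulates three vnsrl.wi steps; extending f16 -> f64 likewise
// accumulates two vfwcvt.f.f.v steps.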
1725 return Cost;
1726 }
1727 case ISD::FP_TO_SINT:
1728 case ISD::FP_TO_UINT: {
1729 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1730 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1731 unsigned FWCVT =
1732 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1733 unsigned FNCVT =
1734 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1735 unsigned SrcEltSize = Src->getScalarSizeInBits();
1736 unsigned DstEltSize = Dst->getScalarSizeInBits();
1737 InstructionCost Cost = 0;
1738 if ((SrcEltSize == 16) &&
1739 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1740 // If the target only supports zvfhmin, or this is an fp16-to-i64 conversion,
1741 // pre-widen to f32 and then convert f32 to the integer type.
1742 VectorType *VecF32Ty =
1743 VectorType::get(Type::getFloatTy(Dst->getContext()),
1744 cast<VectorType>(Dst)->getElementCount());
1745 std::pair<InstructionCost, MVT> VecF32LT =
1746 getTypeLegalizationCost(VecF32Ty);
1747 Cost +=
1748 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1749 VecF32LT.second, CostKind);
1750 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1751 return Cost;
1752 }
1753 if (DstEltSize == SrcEltSize)
1754 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1755 else if (DstEltSize > SrcEltSize)
1756 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1757 else { // (SrcEltSize > DstEltSize)
1758 // First do a narrowing conversion to an integer half the size, then
1759 // truncate if needed.
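// For example (illustrative): f64 -> i8 is costed as one vfncvt.rtz step to
// i32 followed by a separate i32 -> i8 truncate query.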
1760 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1761 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1762 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1763 if ((SrcEltSize / 2) > DstEltSize) {
1764 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1765 Cost +=
1766 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1767 }
1768 }
1769 return Cost;
1770 }
1771 case ISD::SINT_TO_FP:
1772 case ISD::UINT_TO_FP: {
1773 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1774 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1775 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1776 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1777 unsigned SrcEltSize = Src->getScalarSizeInBits();
1778 unsigned DstEltSize = Dst->getScalarSizeInBits();
1779
1780 InstructionCost Cost = 0;
1781 if ((DstEltSize == 16) &&
1782 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1783 // If the target only supports zvfhmin, or this is an i64-to-fp16 conversion,
1784 // the value is converted to f32 first and then narrowed to f16.
1785 VectorType *VecF32Ty =
1786 VectorType::get(Type::getFloatTy(Dst->getContext()),
1787 cast<VectorType>(Dst)->getElementCount());
1788 std::pair<InstructionCost, MVT> VecF32LT =
1789 getTypeLegalizationCost(VecF32Ty);
1790 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1791 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1792 DstLT.second, CostKind);
1793 return Cost;
1794 }
1795
1796 if (DstEltSize == SrcEltSize)
1797 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1798 else if (DstEltSize > SrcEltSize) {
1799 if ((DstEltSize / 2) > SrcEltSize) {
1800 VectorType *VecTy =
1801 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1802 cast<VectorType>(Dst)->getElementCount());
1803 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1804 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1805 }
1806 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1807 } else
1808 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1809 return Cost;
1810 }
1811 }
1812 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1813}
1814
1815unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1816 if (isa<ScalableVectorType>(Ty)) {
1817 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1818 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1819 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1820 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1821 }
1822 return cast<FixedVectorType>(Ty)->getNumElements();
1823}
1824
1825InstructionCost
1826RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1827 FastMathFlags FMF,
1828 TTI::TargetCostKind CostKind) const {
1829 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1830 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1831
1832 // Skip if scalar size of Ty is bigger than ELEN.
1833 if (Ty->getScalarSizeInBits() > ST->getELen())
1834 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1835
1836 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1837 if (Ty->getElementType()->isIntegerTy(1)) {
1838 // SelectionDAGBuilder does following transforms:
1839 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1840 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
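// For example (illustrative): vector_reduce_umax on an <8 x i1> vector is
// therefore costed like vector_reduce_or on the same type.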
1841 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1842 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1843 else
1844 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1845 }
1846
1847 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1848 SmallVector<unsigned, 3> Opcodes;
1849 InstructionCost ExtraCost = 0;
1850 switch (IID) {
1851 case Intrinsic::maximum:
1852 if (FMF.noNaNs()) {
1853 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1854 } else {
1855 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1856 RISCV::VFMV_F_S};
1857 // Cost of canonical NaN + branch
1858 // lui a0, 523264
1859 // fmv.w.x fa0, a0
1860 Type *DstTy = Ty->getScalarType();
1861 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1862 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1863 ExtraCost = 1 +
1864 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1865 TTI::CastContextHint::None, CostKind) +
1866 getCFInstrCost(Instruction::Br, CostKind);
1867 }
1868 break;
1869
1870 case Intrinsic::minimum:
1871 if (FMF.noNaNs()) {
1872 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1873 } else {
1874 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1875 RISCV::VFMV_F_S};
1876 // Cost of canonical NaN + branch
1877 // lui a0, 523264
1878 // fmv.w.x fa0, a0
1879 Type *DstTy = Ty->getScalarType();
1880 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1881 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1882 ExtraCost = 1 +
1883 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1884 TTI::CastContextHint::None, CostKind) +
1885 getCFInstrCost(Instruction::Br, CostKind);
1886 }
1887 break;
1888 }
1889 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1890 }
1891
1892 // An IR reduction is composed of one RVV reduction instruction plus a vmv.
1893 unsigned SplitOp;
1894 SmallVector<unsigned, 3> Opcodes;
1895 switch (IID) {
1896 default:
1897 llvm_unreachable("Unsupported intrinsic");
1898 case Intrinsic::smax:
1899 SplitOp = RISCV::VMAX_VV;
1900 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1901 break;
1902 case Intrinsic::smin:
1903 SplitOp = RISCV::VMIN_VV;
1904 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1905 break;
1906 case Intrinsic::umax:
1907 SplitOp = RISCV::VMAXU_VV;
1908 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1909 break;
1910 case Intrinsic::umin:
1911 SplitOp = RISCV::VMINU_VV;
1912 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1913 break;
1914 case Intrinsic::maxnum:
1915 SplitOp = RISCV::VFMAX_VV;
1916 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1917 break;
1918 case Intrinsic::minnum:
1919 SplitOp = RISCV::VFMIN_VV;
1920 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1921 break;
1922 }
1923 // Add a cost for data larger than LMUL8
1924 InstructionCost SplitCost =
1925 (LT.first > 1) ? (LT.first - 1) *
1926 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1927 : 0;
1928 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1929}
1930
1931InstructionCost
1932RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1933 std::optional<FastMathFlags> FMF,
1934 TTI::TargetCostKind CostKind) const {
1935 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1936 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1937
1938 // Skip if scalar size of Ty is bigger than ELEN.
1939 if (Ty->getScalarSizeInBits() > ST->getELen())
1940 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1941
1942 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1943 assert(ISD && "Invalid opcode");
1944
1945 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1946 ISD != ISD::FADD)
1947 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1948
1949 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1950 Type *ElementTy = Ty->getElementType();
1951 if (ElementTy->isIntegerTy(1)) {
1952 // Example sequences:
1953 // vfirst.m a0, v0
1954 // seqz a0, a0
1955 if (LT.second == MVT::v1i1)
1956 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
1957 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1958 CmpInst::ICMP_EQ, CostKind);
1959
1960 if (ISD == ISD::AND) {
1961 // Example sequences:
1962 // vmand.mm v8, v9, v8 ; needed every time type is split
1963 // vmnot.m v8, v0 ; alias for vmnand
1964 // vcpop.m a0, v8
1965 // seqz a0, a0
1966
1967 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
1968 // For LMUL <= 8, there is no splitting,
1969 // the sequences are vmnot, vcpop and seqz.
1970 // When LMUL > 8 and split = 1,
1971 // the sequences are vmnand, vcpop and seqz.
1972 // When LMUL > 8 and split > 1,
1973 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
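// For example (illustrative): with LT.first == 3 the modeled sequence is one
// vmand.mm, one vmnand.mm, one vcpop.m and the final scalar seqz compare.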
1974 return ((LT.first > 2) ? (LT.first - 2) : 0) *
1975 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
1976 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
1977 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1978 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1979 CmpInst::ICMP_EQ, CostKind);
1980 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
1981 // Example sequences:
1982 // vsetvli a0, zero, e8, mf8, ta, ma
1983 // vmxor.mm v8, v0, v8 ; needed every time type is split
1984 // vcpop.m a0, v8
1985 // andi a0, a0, 1
1986 return (LT.first - 1) *
1987 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1988 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
1989 } else {
1990 assert(ISD == ISD::OR);
1991 // Example sequences:
1992 // vsetvli a0, zero, e8, mf8, ta, ma
1993 // vmor.mm v8, v9, v8 ; needed every time type is split
1994 // vcpop.m a0, v0
1995 // snez a0, a0
1996 return (LT.first - 1) *
1997 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
1998 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1999 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2000 CmpInst::ICMP_NE, CostKind);
2001 }
2002 }
2003
2004 // An IR reduction of or/and is composed of one vmv and one RVV reduction
2005 // instruction; the others are composed of two vmv and one RVV reduction
2006 // instruction.
2007 unsigned SplitOp;
2008 SmallVector<unsigned, 3> Opcodes;
2009 switch (ISD) {
2010 case ISD::ADD:
2011 SplitOp = RISCV::VADD_VV;
2012 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2013 break;
2014 case ISD::OR:
2015 SplitOp = RISCV::VOR_VV;
2016 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2017 break;
2018 case ISD::XOR:
2019 SplitOp = RISCV::VXOR_VV;
2020 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2021 break;
2022 case ISD::AND:
2023 SplitOp = RISCV::VAND_VV;
2024 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2025 break;
2026 case ISD::FADD:
2027 // We can't promote f16/bf16 fadd reductions.
2028 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2029 LT.second.getScalarType() == MVT::bf16)
2030 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2031 if (TTI::requiresOrderedReduction(FMF)) {
2032 Opcodes.push_back(RISCV::VFMV_S_F);
2033 for (unsigned i = 0; i < LT.first.getValue(); i++)
2034 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2035 Opcodes.push_back(RISCV::VFMV_F_S);
2036 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2037 }
2038 SplitOp = RISCV::VFADD_VV;
2039 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2040 break;
2041 }
2042 // Add a cost for data larger than LMUL8
2043 InstructionCost SplitCost =
2044 (LT.first > 1) ? (LT.first - 1) *
2045 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2046 : 0;
2047 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2048}
2049
2050InstructionCost RISCVTTIImpl::getExtendedReductionCost(
2051 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2052 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2053 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2054 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2055 FMF, CostKind);
2056
2057 // Skip if scalar size of ResTy is bigger than ELEN.
2058 if (ResTy->getScalarSizeInBits() > ST->getELen())
2059 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2060 FMF, CostKind);
2061
2062 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2063 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2064 FMF, CostKind);
2065
2066 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2067
2068 if (IsUnsigned && Opcode == Instruction::Add &&
2069 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2070 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2071 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2072 return LT.first *
2073 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2074 }
2075
2076 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2077 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2078 FMF, CostKind);
2079
2080 return (LT.first - 1) +
2081 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2082}
2083
2084InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
2085 TTI::OperandValueInfo OpInfo,
2086 TTI::TargetCostKind CostKind) const {
2087 assert(OpInfo.isConstant() && "non constant operand?");
2088 if (!isa<VectorType>(Ty))
2089 // FIXME: We need to account for immediate materialization here, but doing
2090 // a decent job requires more knowledge about the immediate than we
2091 // currently have here.
2092 return 0;
2093
2094 if (OpInfo.isUniform())
2095 // vmv.v.i, vmv.v.x, or vfmv.v.f
2096 // We ignore the cost of the scalar constant materialization to be consistent
2097 // with how we treat scalar constants themselves just above.
2098 return 1;
2099
2100 return getConstantPoolLoadCost(Ty, CostKind);
2101}
2102
2103InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2104 Align Alignment,
2105 unsigned AddressSpace,
2106 TTI::TargetCostKind CostKind,
2107 TTI::OperandValueInfo OpInfo,
2108 const Instruction *I) const {
2109 EVT VT = TLI->getValueType(DL, Src, true);
2110 // Type legalization can't handle structs
2111 if (VT == MVT::Other)
2112 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2113 CostKind, OpInfo, I);
2114
2115 InstructionCost Cost = 0;
2116 if (Opcode == Instruction::Store && OpInfo.isConstant())
2117 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2118
2119 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2120
2121 InstructionCost BaseCost = [&]() {
2122 InstructionCost Cost = LT.first;
2123 if (CostKind != TTI::TCK_RecipThroughput)
2124 return Cost;
2125
2126 // Our actual lowering for the case where a wider legal type is available
2127 // uses a VL-predicated load on the wider type. This is reflected in
2128 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2129 // widened cases are scalarized.
2130 const DataLayout &DL = this->getDataLayout();
2131 if (Src->isVectorTy() && LT.second.isVector() &&
2132 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2133 LT.second.getSizeInBits()))
2134 return Cost;
2135
2136 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2137 CostKind, OpInfo, I);
2138 }();
2139
2140 // Assume memory ops cost scale with the number of vector registers
2141 // possible accessed by the instruction. Note that BasicTTI already
2142 // handles the LT.first term for us.
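// For example (illustrative): a load legalized to an LMUL=4 type is scaled
// to roughly four times the cost of the equivalent LMUL=1 access here,
// except for code-size queries.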
2143 if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
2144 BaseCost *= TLI->getLMULCost(LT.second);
2145 return Cost + BaseCost;
2146}
2147
2148InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2149 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2150 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2151 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2152 if (CostKind != TTI::TCK_RecipThroughput)
2153 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2154 Op1Info, Op2Info, I);
2155
2156 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2157 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2158 Op1Info, Op2Info, I);
2159
2160 // Skip if scalar size of ValTy is bigger than ELEN.
2161 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2162 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2163 Op1Info, Op2Info, I);
2164
2165 auto GetConstantMatCost =
2166 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2167 if (OpInfo.isUniform())
2168 // We return 0 because we currently ignore the cost of materializing scalar
2169 // constants in GPRs.
2170 return 0;
2171
2172 return getConstantPoolLoadCost(ValTy, CostKind);
2173 };
2174
2175 InstructionCost ConstantMatCost;
2176 if (Op1Info.isConstant())
2177 ConstantMatCost += GetConstantMatCost(Op1Info);
2178 if (Op2Info.isConstant())
2179 ConstantMatCost += GetConstantMatCost(Op2Info);
2180
2181 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2182 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2183 if (CondTy->isVectorTy()) {
2184 if (ValTy->getScalarSizeInBits() == 1) {
2185 // vmandn.mm v8, v8, v9
2186 // vmand.mm v9, v0, v9
2187 // vmor.mm v0, v9, v8
2188 return ConstantMatCost +
2189 LT.first *
2190 getRISCVInstructionCost(
2191 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2192 LT.second, CostKind);
2193 }
2194 // vselect and max/min are supported natively.
2195 return ConstantMatCost +
2196 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2197 CostKind);
2198 }
2199
2200 if (ValTy->getScalarSizeInBits() == 1) {
2201 // vmv.v.x v9, a0
2202 // vmsne.vi v9, v9, 0
2203 // vmandn.mm v8, v8, v9
2204 // vmand.mm v9, v0, v9
2205 // vmor.mm v0, v9, v8
2206 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2207 return ConstantMatCost +
2208 LT.first *
2209 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2210 InterimVT, CostKind) +
2211 LT.first * getRISCVInstructionCost(
2212 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2213 LT.second, CostKind);
2214 }
2215
2216 // vmv.v.x v10, a0
2217 // vmsne.vi v0, v10, 0
2218 // vmerge.vvm v8, v9, v8, v0
2219 return ConstantMatCost +
2220 LT.first * getRISCVInstructionCost(
2221 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2222 LT.second, CostKind);
2223 }
2224
2225 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2226 CmpInst::isIntPredicate(VecPred)) {
2227 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2228 // provided they incur the same cost across all implementations
2229 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2230 LT.second,
2231 CostKind);
2232 }
2233
2234 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2235 CmpInst::isFPPredicate(VecPred)) {
2236
2237 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2238 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2239 return ConstantMatCost +
2240 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2241
2242 // If we do not support the input floating point vector type, use the base
2243 // one which will calculate as:
2244 // ScalarizeCost + Num * Cost for fixed vector,
2245 // InvalidCost for scalable vector.
2246 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2247 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2248 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2249 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2250 Op1Info, Op2Info, I);
2251
2252 // Assuming vector fp compare and mask instructions are all the same cost
2253 // until a need arises to differentiate them.
2254 switch (VecPred) {
2255 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2256 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2257 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2258 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2259 return ConstantMatCost +
2260 LT.first * getRISCVInstructionCost(
2261 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2262 LT.second, CostKind);
2263
2264 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2265 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2266 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2267 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2268 return ConstantMatCost +
2269 LT.first *
2270 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2271 LT.second, CostKind);
2272
2273 case CmpInst::FCMP_OEQ: // vmfeq.vv
2274 case CmpInst::FCMP_OGT: // vmflt.vv
2275 case CmpInst::FCMP_OGE: // vmfle.vv
2276 case CmpInst::FCMP_OLT: // vmflt.vv
2277 case CmpInst::FCMP_OLE: // vmfle.vv
2278 case CmpInst::FCMP_UNE: // vmfne.vv
2279 return ConstantMatCost +
2280 LT.first *
2281 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2282 default:
2283 break;
2284 }
2285 }
2286
2287 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2288 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2289 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2290 // be (0 + select instr cost).
2291 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2292 ValTy->isIntegerTy() && !I->user_empty()) {
2293 if (all_of(I->users(), [&](const User *U) {
2294 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2295 U->getType()->isIntegerTy() &&
2296 !isa<ConstantData>(U->getOperand(1)) &&
2297 !isa<ConstantData>(U->getOperand(2));
2298 }))
2299 return 0;
2300 }
2301
2302 // TODO: Add cost for scalar type.
2303
2304 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2305 Op1Info, Op2Info, I);
2306}
2307
2308InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2309 TTI::TargetCostKind CostKind,
2310 const Instruction *I) const {
2311 if (CostKind != TTI::TCK_RecipThroughput)
2312 return Opcode == Instruction::PHI ? 0 : 1;
2313 // Branches are assumed to be predicted.
2314 return 0;
2315}
2316
2317InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2318 TTI::TargetCostKind CostKind,
2319 unsigned Index,
2320 const Value *Op0,
2321 const Value *Op1) const {
2322 assert(Val->isVectorTy() && "This must be a vector type");
2323
2324 if (Opcode != Instruction::ExtractElement &&
2325 Opcode != Instruction::InsertElement)
2326 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2327
2328 // Legalize the type.
2329 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2330
2331 // This type is legalized to a scalar type.
2332 if (!LT.second.isVector()) {
2333 auto *FixedVecTy = cast<FixedVectorType>(Val);
2334 // If Index is a known constant, cost is zero.
2335 if (Index != -1U)
2336 return 0;
2337 // Extract/InsertElement with non-constant index is very costly when
2338 // scalarized; estimate cost of loads/stores sequence via the stack:
2339 // ExtractElement cost: store vector to stack, load scalar;
2340 // InsertElement cost: store vector to stack, store scalar, load vector.
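// For example (illustrative): extracting a variable lane from a <4 x i32>
// that had to be scalarized is modeled as 4 element stores plus 1 element
// load through the stack; the insert case re-loads the whole vector as well.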
2341 Type *ElemTy = FixedVecTy->getElementType();
2342 auto NumElems = FixedVecTy->getNumElements();
2343 auto Align = DL.getPrefTypeAlign(ElemTy);
2344 InstructionCost LoadCost =
2345 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2346 InstructionCost StoreCost =
2347 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2348 return Opcode == Instruction::ExtractElement
2349 ? StoreCost * NumElems + LoadCost
2350 : (StoreCost + LoadCost) * NumElems + StoreCost;
2351 }
2352
2353 // For unsupported scalable vector.
2354 if (LT.second.isScalableVector() && !LT.first.isValid())
2355 return LT.first;
2356
2357 // Mask vector extract/insert is expanded via e8.
2358 if (Val->getScalarSizeInBits() == 1) {
2359 VectorType *WideTy =
2360 VectorType::get(IntegerType::get(Val->getContext(), 8),
2361 cast<VectorType>(Val)->getElementCount());
2362 if (Opcode == Instruction::ExtractElement) {
2363 InstructionCost ExtendCost
2364 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2365 TTI::CastContextHint::None, CostKind);
2366 InstructionCost ExtractCost
2367 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2368 return ExtendCost + ExtractCost;
2369 }
2370 InstructionCost ExtendCost
2371 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2372 TTI::CastContextHint::None, CostKind);
2373 InstructionCost InsertCost
2374 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2375 InstructionCost TruncCost
2376 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2377 TTI::CastContextHint::None, CostKind);
2378 return ExtendCost + InsertCost + TruncCost;
2379 }
2380
2381
2382 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2383 // and vslideup + vmv.s.x to insert element to vector.
2384 unsigned BaseCost = 1;
2385 // For insertelement we may additionally need an addi to form index+1 as the input of vslideup.
2386 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2387
2388 if (Index != -1U) {
2389 // The type may be split. For fixed-width vectors we can normalize the
2390 // index to the new type.
2391 if (LT.second.isFixedLengthVector()) {
2392 unsigned Width = LT.second.getVectorNumElements();
2393 Index = Index % Width;
2394 }
2395
2396 // If exact VLEN is known, we will insert/extract into the appropriate
2397 // subvector with no additional subvector insert/extract cost.
2398 if (auto VLEN = ST->getRealVLen()) {
2399 unsigned EltSize = LT.second.getScalarSizeInBits();
2400 unsigned M1Max = *VLEN / EltSize;
2401 Index = Index % M1Max;
2402 }
2403
2404 if (Index == 0)
2405 // We can extract/insert the first element without vslidedown/vslideup.
2406 SlideCost = 0;
2407 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2408 Val->getScalarType()->isIntegerTy())
2409 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2410 else if (Opcode == Instruction::InsertElement)
2411 SlideCost = 1; // With a constant index, we do not need to use addi.
2412 }
2413
2414 // When the vector needs to split into multiple register groups and the index
2415 // exceeds single vector register group, we need to insert/extract the element
2416 // via stack.
2417 if (LT.first > 1 &&
2418 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2419 LT.second.isScalableVector()))) {
2420 Type *ScalarType = Val->getScalarType();
2421 Align VecAlign = DL.getPrefTypeAlign(Val);
2422 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2423 // Extra addi for unknown index.
2424 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2425
2426 // Store all split vectors into stack and load the target element.
2427 if (Opcode == Instruction::ExtractElement)
2428 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2429 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2430 CostKind) +
2431 IdxCost;
2432
2433 // Store all split vectors into stack and store the target element and load
2434 // vectors back.
2435 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2436 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2437 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2438 CostKind) +
2439 IdxCost;
2440 }
2441
2442 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2443 if (Val->getScalarType()->isIntegerTy() &&
2444 ST->getXLen() < Val->getScalarSizeInBits()) {
2445 // For extractelement, we need the following instructions:
2446 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2447 // vslidedown.vx v8, v8, a0
2448 // vmv.x.s a0, v8
2449 // li a1, 32
2450 // vsrl.vx v8, v8, a1
2451 // vmv.x.s a1, v8
2452
2453 // For insertelement, we need the following instructions:
2454 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2455 // vmv.v.i v12, 0
2456 // vslide1up.vx v16, v12, a1
2457 // vslide1up.vx v12, v16, a0
2458 // addi a0, a2, 1
2459 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2460 // vslideup.vx v8, v12, a2
2461
2462 // TODO: should we count these special vsetvlis?
2463 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2464 }
2465 return BaseCost + SlideCost;
2466}
2467
2471 unsigned Index) const {
2472 if (isa<FixedVectorType>(Val))
2474 Index);
2475
2476 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2477 // for the cost of extracting the last lane of a scalable vector. It probably
2478 // needs a more accurate cost.
2479 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2480 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2481 return getVectorInstrCost(Opcode, Val, CostKind,
2482 EC.getKnownMinValue() - 1 - Index, nullptr,
2483 nullptr);
2484}
2485
2486InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2487 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2488 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2489 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2490
2491 // TODO: Handle more cost kinds.
2492 if (CostKind != TTI::TCK_RecipThroughput)
2493 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2494 Args, CxtI);
2495
2496 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2497 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2498 Args, CxtI);
2499
2500 // Skip if scalar size of Ty is bigger than ELEN.
2501 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2502 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2503 Args, CxtI);
2504
2505 // Legalize the type.
2506 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2507
2508 // TODO: Handle scalar type.
2509 if (!LT.second.isVector())
2510 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2511 Args, CxtI);
2512
2513 // f16 with zvfhmin and bf16 will be promoted to f32.
2514 // FIXME: nxv32[b]f16 will be custom lowered and split.
2515 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2516 InstructionCost CastCost = 0;
2517 if ((LT.second.getVectorElementType() == MVT::f16 ||
2518 LT.second.getVectorElementType() == MVT::bf16) &&
2519 TLI->getOperationAction(ISDOpcode, LT.second) ==
2520 TargetLoweringBase::LegalizeAction::Promote) {
2521 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2522 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2523 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2524 // Add cost of extending arguments
2525 CastCost += LT.first * Args.size() *
2526 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2527 TTI::CastContextHint::None, CostKind);
2528 // Add cost of truncating result
2529 CastCost +=
2530 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2531 TTI::CastContextHint::None, CostKind);
2532 // Compute cost of op in promoted type
2533 LT.second = PromotedVT;
2534 }
2535
2536 auto getConstantMatCost =
2537 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2538 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2539 // Two sub-cases:
2540 // * Has a 5 bit immediate operand which can be splatted.
2541 // * Has a larger immediate which must be materialized in scalar register
2542 // We return 0 for both as we currently ignore the cost of materializing
2543 // scalar constants in GPRs.
2544 return 0;
2545
2546 return getConstantPoolLoadCost(Ty, CostKind);
2547 };
2548
2549 // Add the cost of materializing any constant vectors required.
2550 InstructionCost ConstantMatCost = 0;
2551 if (Op1Info.isConstant())
2552 ConstantMatCost += getConstantMatCost(0, Op1Info);
2553 if (Op2Info.isConstant())
2554 ConstantMatCost += getConstantMatCost(1, Op2Info);
2555
2556 unsigned Op;
2557 switch (ISDOpcode) {
2558 case ISD::ADD:
2559 case ISD::SUB:
2560 Op = RISCV::VADD_VV;
2561 break;
2562 case ISD::SHL:
2563 case ISD::SRL:
2564 case ISD::SRA:
2565 Op = RISCV::VSLL_VV;
2566 break;
2567 case ISD::AND:
2568 case ISD::OR:
2569 case ISD::XOR:
2570 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2571 break;
2572 case ISD::MUL:
2573 case ISD::MULHS:
2574 case ISD::MULHU:
2575 Op = RISCV::VMUL_VV;
2576 break;
2577 case ISD::SDIV:
2578 case ISD::UDIV:
2579 Op = RISCV::VDIV_VV;
2580 break;
2581 case ISD::SREM:
2582 case ISD::UREM:
2583 Op = RISCV::VREM_VV;
2584 break;
2585 case ISD::FADD:
2586 case ISD::FSUB:
2587 Op = RISCV::VFADD_VV;
2588 break;
2589 case ISD::FMUL:
2590 Op = RISCV::VFMUL_VV;
2591 break;
2592 case ISD::FDIV:
2593 Op = RISCV::VFDIV_VV;
2594 break;
2595 case ISD::FNEG:
2596 Op = RISCV::VFSGNJN_VV;
2597 break;
2598 default:
2599 // Assuming all other instructions have the same cost until a need arises to
2600 // differentiate them.
2601 return CastCost + ConstantMatCost +
2602 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2603 Args, CxtI);
2604 }
2605
2606 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2607 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2608 // ops are twice as expensive as integer ops. Do the same for vectors so
2609 // scalar floating point ops aren't cheaper than their vector equivalents.
2610 if (Ty->isFPOrFPVectorTy())
2611 InstrCost *= 2;
2612 return CastCost + ConstantMatCost + LT.first * InstrCost;
2613}
2614
2615// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2616InstructionCost RISCVTTIImpl::getPointersChainCost(
2617 ArrayRef<const Value *> Ptrs, const Value *Base,
2618 const TTI::PointersChainInfo &Info, Type *AccessTy,
2619 TTI::TargetCostKind CostKind) const {
2620 InstructionCost Cost = TTI::TCC_Free;
2621 // In the basic model we take into account GEP instructions only
2622 // (although here can come alloca instruction, a value, constants and/or
2623 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2624 // pointer). Typically, if Base is not a GEP-instruction and all the
2625 // pointers are relative to the same base address, all the rest are
2626 // either GEP instructions, PHIs, bitcasts or constants. When we have same
2627 // base, we just calculate cost of each non-Base GEP as an ADD operation if
2628 // any of their indices is non-constant.
2629 // If no known dependencies between the pointers cost is calculated as a sum
2630 // of costs of GEP instructions.
2631 for (auto [I, V] : enumerate(Ptrs)) {
2632 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2633 if (!GEP)
2634 continue;
2635 if (Info.isSameBase() && V != Base) {
2636 if (GEP->hasAllConstantIndices())
2637 continue;
2638 // If the chain is unit-stride and BaseReg + stride*i is a legal
2639 // addressing mode, then presume the base GEP is sitting around in a
2640 // register somewhere and check if we can fold the offset relative to
2641 // it.
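// For example (illustrative): in a unit-stride chain over i32 elements, the
// GEP at position 3 sits at BaseReg + 12, which typically folds into the
// memory operation's addressing mode and so adds no extra cost here.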
2642 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2643 if (Info.isUnitStride() &&
2644 isLegalAddressingMode(AccessTy,
2645 /* BaseGV */ nullptr,
2646 /* BaseOffset */ Stride * I,
2647 /* HasBaseReg */ true,
2648 /* Scale */ 0,
2649 GEP->getType()->getPointerAddressSpace()))
2650 continue;
2651 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2652 {TTI::OK_AnyValue, TTI::OP_None},
2653 {TTI::OK_AnyValue, TTI::OP_None}, {});
2654 } else {
2655 SmallVector<const Value *> Indices(GEP->indices());
2656 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2657 Indices, AccessTy, CostKind);
2658 }
2659 }
2660 return Cost;
2661}
2662
2663void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2664 TTI::UnrollingPreferences &UP,
2665 OptimizationRemarkEmitter *ORE) const {
2666 // TODO: More tuning on benchmarks and metrics with changes as needed
2667 // would apply to all settings below to enable performance.
2668
2669
2670 if (ST->enableDefaultUnroll())
2671 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2672
2673 // Enable Upper bound unrolling universally, not dependent upon the conditions
2674 // below.
2675 UP.UpperBound = true;
2676
2677 // Disable loop unrolling for Oz and Os.
2678 UP.OptSizeThreshold = 0;
2679 UP.PartialOptSizeThreshold = 0;
2680 if (L->getHeader()->getParent()->hasOptSize())
2681 return;
2682
2683 SmallVector<BasicBlock *, 4> ExitingBlocks;
2684 L->getExitingBlocks(ExitingBlocks);
2685 LLVM_DEBUG(dbgs() << "Loop has:\n"
2686 << "Blocks: " << L->getNumBlocks() << "\n"
2687 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2688
2689 // Only allow another exit other than the latch. This acts as an early exit
2690 // as it mirrors the profitability calculation of the runtime unroller.
2691 if (ExitingBlocks.size() > 2)
2692 return;
2693
2694 // Limit the CFG of the loop body for targets with a branch predictor.
2695 // Allowing 4 blocks permits if-then-else diamonds in the body.
2696 if (L->getNumBlocks() > 4)
2697 return;
2698
2699 // Scan the loop: don't unroll loops with calls as this could prevent
2700 // inlining. Don't unroll auto-vectorized loops either, though do allow
2701 // unrolling of the scalar remainder.
2702 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2703 InstructionCost Cost = 0;
2704 for (auto *BB : L->getBlocks()) {
2705 for (auto &I : *BB) {
2706 // Both auto-vectorized loops and the scalar remainder have the
2707 // isvectorized attribute, so differentiate between them by the presence
2708 // of vector instructions.
2709 if (IsVectorized && I.getType()->isVectorTy())
2710 return;
2711
2712 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2713 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2714 if (!isLoweredToCall(F))
2715 continue;
2716 }
2717 return;
2718 }
2719
2720 SmallVector<const Value *> Operands(I.operand_values());
2721 Cost += getInstructionCost(&I, Operands,
2722 TargetTransformInfo::TCK_SizeAndLatency);
2723 }
2724 }
2725
2726 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2727
2728 UP.Partial = true;
2729 UP.Runtime = true;
2730 UP.UnrollRemainder = true;
2731 UP.UnrollAndJam = true;
2732
2733 // Forcing unrolling of small loops can be very useful because of the
2734 // branch-taken cost of the backedge.
2735 if (Cost < 12)
2736 UP.Force = true;
2737}
2738
2743
2744bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2745 MemIntrinsicInfo &Info) const {
2746 const DataLayout &DL = getDataLayout();
2747 Intrinsic::ID IID = Inst->getIntrinsicID();
2748 LLVMContext &C = Inst->getContext();
2749 bool HasMask = false;
2750 switch (IID) {
2751 case Intrinsic::riscv_vle_mask:
2752 case Intrinsic::riscv_vse_mask:
2753 HasMask = true;
2754 [[fallthrough]];
2755 case Intrinsic::riscv_vle:
2756 case Intrinsic::riscv_vse: {
2757 // Intrinsic interface:
2758 // riscv_vle(merge, ptr, vl)
2759 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2760 // riscv_vse(val, ptr, vl)
2761 // riscv_vse_mask(val, ptr, mask, vl, policy)
2762 bool IsWrite = Inst->getType()->isVoidTy();
2763 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2764 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2765 unsigned VLIndex = RVVIInfo->VLOperand;
2766 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
2767 MaybeAlign Alignment =
2768 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2769 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2770 Value *Mask = ConstantInt::getTrue(MaskType);
2771 if (HasMask)
2772 Mask = Inst->getArgOperand(VLIndex - 1);
2773 Value *EVL = Inst->getArgOperand(VLIndex);
2774 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2775 Alignment, Mask, EVL);
2776 return true;
2777 }
2778 case Intrinsic::riscv_vlse_mask:
2779 case Intrinsic::riscv_vsse_mask:
2780 HasMask = true;
2781 [[fallthrough]];
2782 case Intrinsic::riscv_vlse:
2783 case Intrinsic::riscv_vsse: {
2784 // Intrinsic interface:
2785 // riscv_vlse(merge, ptr, stride, vl)
2786 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
2787 // riscv_vsse(val, ptr, stride, vl)
2788 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
2789 bool IsWrite = Inst->getType()->isVoidTy();
2790 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2791 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2792 unsigned VLIndex = RVVIInfo->VLOperand;
2793 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
2794 MaybeAlign Alignment =
2795 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2796
2797 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
2798 // Use the pointer alignment as the element alignment if the stride is a
2799 // multiple of the pointer alignment. Otherwise, the element alignment
2800 // should be the greatest common divisor of pointer alignment and stride.
2801 // For simplicity, just consider unalignment for elements.
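// For example (illustrative): a pointer aligned to 4 with a constant stride
// of 6 only guarantees 2-byte element alignment, so the code below
// conservatively records Align(1) rather than computing the exact GCD.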
2802 unsigned PointerAlign = Alignment.valueOrOne().value();
2803 if (!isa<ConstantInt>(Stride) ||
2804 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
2805 Alignment = Align(1);
2806
2807 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2808 Value *Mask = ConstantInt::getTrue(MaskType);
2809 if (HasMask)
2810 Mask = Inst->getArgOperand(VLIndex - 1);
2811 Value *EVL = Inst->getArgOperand(VLIndex);
2812 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2813 Alignment, Mask, EVL, Stride);
2814 return true;
2815 }
2816 case Intrinsic::riscv_vloxei_mask:
2817 case Intrinsic::riscv_vluxei_mask:
2818 case Intrinsic::riscv_vsoxei_mask:
2819 case Intrinsic::riscv_vsuxei_mask:
2820 HasMask = true;
2821 [[fallthrough]];
2822 case Intrinsic::riscv_vloxei:
2823 case Intrinsic::riscv_vluxei:
2824 case Intrinsic::riscv_vsoxei:
2825 case Intrinsic::riscv_vsuxei: {
2826 // Intrinsic interface (only listed ordered version):
2827 // riscv_vloxei(merge, ptr, index, vl)
2828 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
2829 // riscv_vsoxei(val, ptr, index, vl)
2830 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
2831 bool IsWrite = Inst->getType()->isVoidTy();
2832 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2833 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2834 unsigned VLIndex = RVVIInfo->VLOperand;
2835 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
2836 Value *Mask;
2837 if (HasMask) {
2838 Mask = Inst->getArgOperand(VLIndex - 1);
2839 } else {
2840 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
2841 // and casting that to scalar i64 triggers a vector/scalar mismatch
2842 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
2843 // via extractelement instead.
2844 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2845 Mask = ConstantInt::getTrue(MaskType);
2846 }
2847 Value *EVL = Inst->getArgOperand(VLIndex);
2848 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
2849 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2850 Align(1), Mask, EVL,
2851 /* Stride */ nullptr, OffsetOp);
2852 return true;
2853 }
2854 }
2855 return false;
2856}
2857
2858unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const {
2859 if (Ty->isVectorTy()) {
2860 // f16 with only zvfhmin and bf16 will be promoted to f32
2861 Type *EltTy = cast<VectorType>(Ty)->getElementType();
2862 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
2863 EltTy->isBFloatTy())
2864 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
2865 cast<VectorType>(Ty));
2866
2867 TypeSize Size = DL.getTypeSizeInBits(Ty);
2868 if (Size.isScalable() && ST->hasVInstructions())
2869 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
2870
2871 if (ST->useRVVForFixedLengthVectors())
2872 return divideCeil(Size, ST->getRealMinVLen());
2873 }
2874
2875 return BaseT::getRegUsageForType(Ty);
2876}
2877
2878unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2879 if (SLPMaxVF.getNumOccurrences())
2880 return SLPMaxVF;
2881
2882 // Return how many elements can fit in getRegisterBitwidth. This is the
2883 // same routine as used in LoopVectorizer. We should probably be
2884 // accounting for whether we actually have instructions with the right
2885 // lane type, but we don't have enough information to do that without
2886 // some additional plumbing which hasn't been justified yet.
2887 TypeSize RegWidth =
2888 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
2889 // If no vector registers, or absurd element widths, disable
2890 // vectorization by returning 1.
2891 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
2892}
2893
2897
2899 return ST->enableUnalignedVectorMem();
2900}
2901
2902TTI::AddressingModeKind
2903RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
2904 ScalarEvolution *SE) const {
2905 if (ST->hasVendorXCVmem() && !ST->is64Bit())
2906 return TTI::AMK_PostIndexed;
2907
2908 return BaseT::getPreferredAddressingMode(L, SE);
2909}
2910
2912 const TargetTransformInfo::LSRCost &C2) const {
2913 // RISC-V specific here are "instruction number 1st priority".
2914 // If we need to emit adds inside the loop to add up base registers, then
2915 // we need at least one extra temporary register.
2916 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
2917 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
2918 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
2919 C1.NumIVMuls, C1.NumBaseAdds,
2920 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2921 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
2922 C2.NumIVMuls, C2.NumBaseAdds,
2923 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2924}
2925
2926bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
2927 Align Alignment) const {
2928 auto *VTy = dyn_cast<VectorType>(DataTy);
2929 if (!VTy || VTy->isScalableTy())
2930 return false;
2931
2932 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2933 return false;
2934
2935 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
2936 // scalarize these types with LMUL >= maximum fixed-length LMUL.
2937 if (VTy->getElementType()->isIntegerTy(8))
2938 if (VTy->getElementCount().getFixedValue() > 256)
2939 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
2940 ST->getMaxLMULForFixedLengthVectors();
2941 return true;
2942}
2943
2944bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy,
2945 Align Alignment) const {
2946 auto *VTy = dyn_cast<VectorType>(DataTy);
2947 if (!VTy || VTy->isScalableTy())
2948 return false;
2949
2950 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2951 return false;
2952 return true;
2953}
2954
2955/// See if \p I should be considered for address type promotion. We check if \p
2956/// I is a sext with right type and used in memory accesses. If it used in a
2957/// "complex" getelementptr, we allow it to be promoted without finding other
2958/// sext instructions that sign extended the same initial value. A getelementptr
2959/// is considered as "complex" if it has more than 2 operands.
2960bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
2961 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
2962 bool Considerable = false;
2963 AllowPromotionWithoutCommonHeader = false;
2964 if (!isa<SExtInst>(&I))
2965 return false;
2966 Type *ConsideredSExtType =
2967 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2968 if (I.getType() != ConsideredSExtType)
2969 return false;
2970 // See if the sext is the one with the right type and used in at least one
2971 // GetElementPtrInst.
2972 for (const User *U : I.users()) {
2973 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2974 Considerable = true;
2975 // A getelementptr is considered as "complex" if it has more than 2
2976 // operands. We will promote a SExt used in such complex GEP as we
2977 // expect some computation to be merged if they are done on 64 bits.
2978 if (GEPInst->getNumOperands() > 2) {
2979 AllowPromotionWithoutCommonHeader = true;
2980 break;
2981 }
2982 }
2983 }
2984 return Considerable;
2985}
2986
2987bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
2988 switch (Opcode) {
2989 case Instruction::Add:
2990 case Instruction::Sub:
2991 case Instruction::Mul:
2992 case Instruction::And:
2993 case Instruction::Or:
2994 case Instruction::Xor:
2995 case Instruction::FAdd:
2996 case Instruction::FSub:
2997 case Instruction::FMul:
2998 case Instruction::FDiv:
2999 case Instruction::ICmp:
3000 case Instruction::FCmp:
3001 return true;
3002 case Instruction::Shl:
3003 case Instruction::LShr:
3004 case Instruction::AShr:
3005 case Instruction::UDiv:
3006 case Instruction::SDiv:
3007 case Instruction::URem:
3008 case Instruction::SRem:
3009 case Instruction::Select:
3010 return Operand == 1;
3011 default:
3012 return false;
3013 }
3014}
3015
3016bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
3017 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3018 return false;
3019
3020 if (canSplatOperand(I->getOpcode(), Operand))
3021 return true;
3022
3023 auto *II = dyn_cast<IntrinsicInst>(I);
3024 if (!II)
3025 return false;
3026
3027 switch (II->getIntrinsicID()) {
3028 case Intrinsic::fma:
3029 case Intrinsic::vp_fma:
3030 case Intrinsic::fmuladd:
3031 case Intrinsic::vp_fmuladd:
3032 return Operand == 0 || Operand == 1;
3033 case Intrinsic::vp_shl:
3034 case Intrinsic::vp_lshr:
3035 case Intrinsic::vp_ashr:
3036 case Intrinsic::vp_udiv:
3037 case Intrinsic::vp_sdiv:
3038 case Intrinsic::vp_urem:
3039 case Intrinsic::vp_srem:
3040 case Intrinsic::ssub_sat:
3041 case Intrinsic::vp_ssub_sat:
3042 case Intrinsic::usub_sat:
3043 case Intrinsic::vp_usub_sat:
3044 case Intrinsic::vp_select:
3045 return Operand == 1;
3046 // These intrinsics are commutative.
3047 case Intrinsic::vp_add:
3048 case Intrinsic::vp_mul:
3049 case Intrinsic::vp_and:
3050 case Intrinsic::vp_or:
3051 case Intrinsic::vp_xor:
3052 case Intrinsic::vp_fadd:
3053 case Intrinsic::vp_fmul:
3054 case Intrinsic::vp_icmp:
3055 case Intrinsic::vp_fcmp:
3056 case Intrinsic::smin:
3057 case Intrinsic::vp_smin:
3058 case Intrinsic::umin:
3059 case Intrinsic::vp_umin:
3060 case Intrinsic::smax:
3061 case Intrinsic::vp_smax:
3062 case Intrinsic::umax:
3063 case Intrinsic::vp_umax:
3064 case Intrinsic::sadd_sat:
3065 case Intrinsic::vp_sadd_sat:
3066 case Intrinsic::uadd_sat:
3067 case Intrinsic::vp_uadd_sat:
3068 // These intrinsics have 'vr' versions.
3069 case Intrinsic::vp_sub:
3070 case Intrinsic::vp_fsub:
3071 case Intrinsic::vp_fdiv:
3072 return Operand == 0 || Operand == 1;
3073 default:
3074 return false;
3075 }
3076}
3077
3078/// Check if sinking \p I's operands to I's basic block is profitable, because
3079/// the operands can be folded into a target instruction, e.g.
3080/// splats of scalars can fold into vector instructions.
3081bool RISCVTTIImpl::isProfitableToSinkOperands(
3082 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
3083 using namespace llvm::PatternMatch;
3084
3085 if (I->isBitwiseLogicOp()) {
3086 if (!I->getType()->isVectorTy()) {
3087 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3088 for (auto &Op : I->operands()) {
3089 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3090 if (match(Op.get(), m_Not(m_Value()))) {
3091 Ops.push_back(&Op);
3092 return true;
3093 }
3094 }
3095 }
3096 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3097 for (auto &Op : I->operands()) {
3098 // (and X, (not Y)) -> (vandn.vv X, Y)
3099 if (match(Op.get(), m_Not(m_Value()))) {
3100 Ops.push_back(&Op);
3101 return true;
3102 }
3103 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3104 if (match(Op.get(), m_Shuffle(m_InsertElt(m_Value(), m_Not(m_Value()),
3105 m_ZeroInt()),
3106 m_Value(), m_ZeroMask()))) {
3107 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3108 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3109 Ops.push_back(&Not);
3110 Ops.push_back(&InsertElt);
3111 Ops.push_back(&Op);
3112 return true;
3113 }
3114 }
3115 }
3116 }
3117
3118 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3119 return false;
3120
3121 // Don't sink splat operands unless the target prefers to. Some targets require
3122 // S2V transfer buffers and we can run out of them copying the same value
3123 // repeatedly.
3124 // FIXME: It could still be worth doing if it would improve vector register
3125 // pressure and prevent a vector spill.
3126 if (!ST->sinkSplatOperands())
3127 return false;
3128
3129 for (auto OpIdx : enumerate(I->operands())) {
3130 if (!canSplatOperand(I, OpIdx.index()))
3131 continue;
3132
3133 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3134 // Make sure we are not already sinking this operand
3135 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3136 continue;
3137
3138 // We are looking for a splat/vp.splat that can be sunk.
3139 bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
3140 m_Value(), m_Value(), m_Value()));
3141 if (!IsVPSplat &&
3142 !match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
3143 m_Undef(), m_ZeroMask())))
3144 continue;
3145
3146 // Don't sink i1 splats.
3147 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3148 continue;
3149
3150 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
3151 // and vector registers
3152 for (Use &U : Op->uses()) {
3153 Instruction *Insn = cast<Instruction>(U.getUser());
3154 if (!canSplatOperand(Insn, U.getOperandNo()))
3155 return false;
3156 }
3157
3158 // Sink any fpexts since they might be used in a widening fp pattern.
3159 if (IsVPSplat) {
3160 if (isa<FPExtInst>(Op->getOperand(0)))
3161 Ops.push_back(&Op->getOperandUse(0));
3162 } else {
3163 Use *InsertEltUse = &Op->getOperandUse(0);
3164 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3165 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3166 Ops.push_back(&InsertElt->getOperandUse(1));
3167 Ops.push_back(InsertEltUse);
3168 }
3169 Ops.push_back(&OpIdx.value());
3170 }
3171 return true;
3172}
3173
3174TTI::MemCmpExpansionOptions
3175RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3176 TTI::MemCmpExpansionOptions Options;
3177 // TODO: Enable expansion when unaligned access is not supported after we fix
3178 // issues in ExpandMemcmp.
3179 if (!ST->enableUnalignedScalarMem())
3180 return Options;
3181
3182 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3183 return Options;
3184
3185 Options.AllowOverlappingLoads = true;
3186 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3187 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3188 if (ST->is64Bit()) {
3189 Options.LoadSizes = {8, 4, 2, 1};
3190 Options.AllowedTailExpansions = {3, 5, 6};
3191 } else {
3192 Options.LoadSizes = {4, 2, 1};
3193 Options.AllowedTailExpansions = {3};
3194 }
3195
3196 if (IsZeroCmp && ST->hasVInstructions()) {
3197 unsigned VLenB = ST->getRealMinVLen() / 8;
3198 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3199 // `VLenB * MaxLMUL` so that it fits in a single register group.
3200 unsigned MinSize = ST->getXLen() / 8 + 1;
3201 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
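// For example (illustrative): on RV64 with VLEN=128 (VLenB=16) and a maximum
// fixed-length LMUL of 8, sizes 9 through 128 bytes are added to LoadSizes
// for the zero-compare case.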
3202 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3203 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3204 }
3205 return Options;
3206}
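
The vector part of the load-size table above is a simple arithmetic range. The following standalone sketch (plain C++, not LLVM API) reproduces that computation for an assumed RV64 target with a minimum VLEN of 128 bits and a maximum fixed-length LMUL of 8; all three parameters are illustrative assumptions, not values read from a real subtarget:

// sketch_memcmp_sizes.cpp -- illustrative only.
#include <cstdio>
#include <vector>

int main() {
  const unsigned XLen = 64;         // assumed: RV64
  const unsigned RealMinVLen = 128; // assumed: minimum VLEN in bits
  const unsigned MaxLMUL = 8;       // assumed: max LMUL for fixed-length vectors

  std::vector<unsigned> LoadSizes = {8, 4, 2, 1}; // scalar sizes for a 64-bit target

  const unsigned VLenB = RealMinVLen / 8;   // 16 bytes per vector register
  const unsigned MinSize = XLen / 8 + 1;    // 9: first size the scalar loads can't cover
  const unsigned MaxSize = VLenB * MaxLMUL; // 128: one full vector register group
  for (unsigned Size = MinSize; Size <= MaxSize; ++Size)
    LoadSizes.insert(LoadSizes.begin(), Size); // vector sizes end up largest-first

  std::printf("vector-expanded memcmp sizes: %u..%u bytes\n", MinSize, MaxSize);
  std::printf("largest candidate load: %u bytes, smallest: %u byte(s)\n",
              LoadSizes.front(), LoadSizes.back());
  return 0;
}

With these assumed parameters, zero-compares of 9 through 128 bytes become single vector loads, while anything up to XLen/8 bytes is still handled by the overlapping scalar loads listed first.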