1//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a TargetTransformInfo analysis pass specific to the
10// SystemZ target machine. It uses the target's detailed information to provide
11// more precise answers to certain TTI queries, while letting the target
12// independent and default TTI implementations handle the rest.
13//
14//===----------------------------------------------------------------------===//
15
16#include "SystemZTargetTransformInfo.h"
17#include "llvm/Analysis/TargetTransformInfo.h"
18#include "llvm/CodeGen/BasicTTIImpl.h"
19#include "llvm/CodeGen/CostTable.h"
20#include "llvm/CodeGen/TargetLowering.h"
21#include "llvm/IR/DerivedTypes.h"
22#include "llvm/IR/InstIterator.h"
23#include "llvm/IR/Intrinsics.h"
24#include "llvm/Support/Debug.h"
25#include "llvm/Support/MathExtras.h"
27
28using namespace llvm;
29
30#define DEBUG_TYPE "systemztti"
31
32//===----------------------------------------------------------------------===//
33//
34// SystemZ cost model.
35//
36//===----------------------------------------------------------------------===//
37
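// Returns true if V is used, possibly through bitcasts or GEPs, as the source
// operand of a non-volatile memcpy. OtherUse is set if any other kind of use
// is seen along the way.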
38static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
39 bool UsedAsMemCpySource = false;
40 for (const User *U : V->users())
41 if (const Instruction *User = dyn_cast<Instruction>(U)) {
42 if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
43 UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
44 continue;
45 }
46 if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
47 if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
48 UsedAsMemCpySource = true;
49 continue;
50 }
51 }
52 OtherUse = true;
53 }
54 return UsedAsMemCpySource;
55}
56
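// Count the non-volatile loads and stores of Ptr (including accesses through
// GEPs derived from it) that appear inside function F.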
57static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
58 unsigned &NumLoads, const Function *F) {
59 if (!isa<PointerType>(Ptr->getType()))
60 return;
61 for (const User *U : Ptr->users())
62 if (const Instruction *User = dyn_cast<Instruction>(U)) {
63 if (User->getParent()->getParent() == F) {
64 if (const auto *SI = dyn_cast<StoreInst>(User)) {
65 if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
66 NumStores++;
67 } else if (const auto *LI = dyn_cast<LoadInst>(User)) {
68 if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
69 NumLoads++;
70 } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
71 if (GEP->getPointerOperand() == Ptr)
72 countNumMemAccesses(GEP, NumStores, NumLoads, F);
73 }
74 }
75 }
76}
77
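// SystemZ-specific inlining bonus: give the inliner extra budget when an
// incoming argument is only forwarded as a memcpy source, when caller and
// callee both access the same global variables heavily, or when the callee
// heavily accesses an alloca passed in by the caller. The bonus is capped at
// 1000 below.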
78unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
79 unsigned Bonus = 0;
80 const Function *Caller = CB->getParent()->getParent();
81 const Function *Callee = CB->getCalledFunction();
82 if (!Callee)
83 return 0;
84
85 // Increase the threshold if an incoming argument is used only as a memcpy
86 // source.
87 for (const Argument &Arg : Callee->args()) {
88 bool OtherUse = false;
89 if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) {
90 Bonus = 1000;
91 break;
92 }
93 }
94
95 // Give a bonus for globals that are used heavily in both the caller and a
96 // relatively small callee.
97 unsigned InstrCount = 0;
98 SmallDenseMap<const GlobalVariable *, unsigned> Ptr2NumUses;
99 for (auto &I : instructions(Callee)) {
100 if (++InstrCount == 200) {
101 Ptr2NumUses.clear();
102 break;
103 }
104 if (const auto *SI = dyn_cast<StoreInst>(&I)) {
105 if (!SI->isVolatile())
106 if (auto *GV = dyn_cast<GlobalVariable>(SI->getPointerOperand()))
107 Ptr2NumUses[GV]++;
108 } else if (const auto *LI = dyn_cast<LoadInst>(&I)) {
109 if (!LI->isVolatile())
110 if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
111 Ptr2NumUses[GV]++;
112 } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
113 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand())) {
114 unsigned NumStores = 0, NumLoads = 0;
115 countNumMemAccesses(GEP, NumStores, NumLoads, Callee);
116 Ptr2NumUses[GV] += NumLoads + NumStores;
117 }
118 }
119 }
120
121 for (auto [Ptr, NumCalleeUses] : Ptr2NumUses)
122 if (NumCalleeUses > 10) {
123 unsigned CallerStores = 0, CallerLoads = 0;
124 countNumMemAccesses(Ptr, CallerStores, CallerLoads, Caller);
125 if (CallerStores + CallerLoads > 10) {
126 Bonus = 1000;
127 break;
128 }
129 }
130
131 // Give bonus when Callee accesses an Alloca of Caller heavily.
132 unsigned NumStores = 0;
133 unsigned NumLoads = 0;
134 for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
135 Value *CallerArg = CB->getArgOperand(OpIdx);
136 Argument *CalleeArg = Callee->getArg(OpIdx);
137 if (isa<AllocaInst>(CallerArg))
138 countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee);
139 }
140 if (NumLoads > 10)
141 Bonus += NumLoads * 50;
142 if (NumStores > 10)
143 Bonus += NumStores * 50;
144 Bonus = std::min(Bonus, unsigned(1000));
145
146 LLVM_DEBUG(if (Bonus)
147 dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
148 return Bonus;
149}
150
151InstructionCost
152SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
153 TTI::TargetCostKind CostKind) const {
154 assert(Ty->isIntegerTy());
155
156 unsigned BitSize = Ty->getPrimitiveSizeInBits();
157 // There is no cost model for constants with a bit size of 0. Return TCC_Free
158 // here, so that constant hoisting will ignore this constant.
159 if (BitSize == 0)
160 return TTI::TCC_Free;
161 // No cost model has been implemented yet for operations on integers wider than 128 bits.
162 if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
163 return TTI::TCC_Free;
164
165 if (Imm == 0)
166 return TTI::TCC_Free;
167
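 // For example, with the checks below a 64-bit constant costs:
 //   0xfffffffffedcba98 (sign-extendable 32-bit) -> 1 (lgfi)
 //   0x00000000fedcba98 (zero-extendable 32-bit) -> 1 (llilf)
 //   0xfedcba9800000000 (low 32 bits zero)       -> 1 (llihf)
 //   any other 64-bit pattern                    -> 2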
168 if (Imm.getBitWidth() <= 64) {
169 // Constants loaded via lgfi.
170 if (isInt<32>(Imm.getSExtValue()))
171 return TTI::TCC_Basic;
172 // Constants loaded via llilf.
173 if (isUInt<32>(Imm.getZExtValue()))
174 return TTI::TCC_Basic;
175 // Constants loaded via llihf:
176 if ((Imm.getZExtValue() & 0xffffffff) == 0)
177 return TTI::TCC_Basic;
178
179 return 2 * TTI::TCC_Basic;
180 }
181
182 // i128 immediates are loaded from the constant pool.
183 return 2 * TTI::TCC_Basic;
184}
185
186InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
187 const APInt &Imm, Type *Ty,
188 TTI::TargetCostKind CostKind,
189 Instruction *Inst) const {
190 assert(Ty->isIntegerTy());
191
192 unsigned BitSize = Ty->getPrimitiveSizeInBits();
193 // There is no cost model for constants with a bit size of 0. Return TCC_Free
194 // here, so that constant hoisting will ignore this constant.
195 if (BitSize == 0)
196 return TTI::TCC_Free;
197 // No cost model has been implemented yet for operations on integers wider than 64 bits.
198 if (BitSize > 64)
199 return TTI::TCC_Free;
200
201 switch (Opcode) {
202 default:
203 return TTI::TCC_Free;
204 case Instruction::GetElementPtr:
205 // Always hoist the base address of a GetElementPtr. This prevents the
206 // creation of new constants for every base constant that gets constant
207 // folded with the offset.
208 if (Idx == 0)
209 return 2 * TTI::TCC_Basic;
210 return TTI::TCC_Free;
211 case Instruction::Store:
212 if (Idx == 0 && Imm.getBitWidth() <= 64) {
213 // Any 8-bit immediate store can be implemented via mvi.
214 if (BitSize == 8)
215 return TTI::TCC_Free;
216 // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
217 if (isInt<16>(Imm.getSExtValue()))
218 return TTI::TCC_Free;
219 }
220 break;
221 case Instruction::ICmp:
222 if (Idx == 1 && Imm.getBitWidth() <= 64) {
223 // Comparisons against signed 32-bit immediates implemented via cgfi.
224 if (isInt<32>(Imm.getSExtValue()))
225 return TTI::TCC_Free;
226 // Comparisons against unsigned 32-bit immediates implemented via clgfi.
227 if (isUInt<32>(Imm.getZExtValue()))
228 return TTI::TCC_Free;
229 }
230 break;
231 case Instruction::Add:
232 case Instruction::Sub:
233 if (Idx == 1 && Imm.getBitWidth() <= 64) {
234 // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
235 if (isUInt<32>(Imm.getZExtValue()))
236 return TTI::TCC_Free;
237 // Or their negation, by swapping addition vs. subtraction.
238 if (isUInt<32>(-Imm.getSExtValue()))
239 return TTI::TCC_Free;
240 }
241 break;
242 case Instruction::Mul:
243 if (Idx == 1 && Imm.getBitWidth() <= 64) {
244 // We use msgfi to multiply by 32-bit signed immediates.
245 if (isInt<32>(Imm.getSExtValue()))
246 return TTI::TCC_Free;
247 }
248 break;
249 case Instruction::Or:
250 case Instruction::Xor:
251 if (Idx == 1 && Imm.getBitWidth() <= 64) {
252 // Masks supported by oilf/xilf.
253 if (isUInt<32>(Imm.getZExtValue()))
254 return TTI::TCC_Free;
255 // Masks supported by oihf/xihf.
256 if ((Imm.getZExtValue() & 0xffffffff) == 0)
257 return TTI::TCC_Free;
258 }
259 break;
260 case Instruction::And:
261 if (Idx == 1 && Imm.getBitWidth() <= 64) {
262 // Any 32-bit AND operation can be implemented via nilf.
263 if (BitSize <= 32)
264 return TTI::TCC_Free;
265 // 64-bit masks supported by nilf.
266 if (isUInt<32>(~Imm.getZExtValue()))
267 return TTI::TCC_Free;
268 // 64-bit masks supported by nilh.
269 if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
270 return TTI::TCC_Free;
271 // Some 64-bit AND operations can be implemented via risbg.
272 const SystemZInstrInfo *TII = ST->getInstrInfo();
273 unsigned Start, End;
274 if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
275 return TTI::TCC_Free;
276 }
277 break;
278 case Instruction::Shl:
279 case Instruction::LShr:
280 case Instruction::AShr:
281 // Always return TCC_Free for the shift value of a shift instruction.
282 if (Idx == 1)
283 return TTI::TCC_Free;
284 break;
285 case Instruction::UDiv:
286 case Instruction::SDiv:
287 case Instruction::URem:
288 case Instruction::SRem:
289 case Instruction::Trunc:
290 case Instruction::ZExt:
291 case Instruction::SExt:
292 case Instruction::IntToPtr:
293 case Instruction::PtrToInt:
294 case Instruction::BitCast:
295 case Instruction::PHI:
296 case Instruction::Call:
297 case Instruction::Select:
298 case Instruction::Ret:
299 case Instruction::Load:
300 break;
301 }
302
303 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
304}
305
306InstructionCost
307SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
308 const APInt &Imm, Type *Ty,
309 TTI::TargetCostKind CostKind) const {
310 assert(Ty->isIntegerTy());
311
312 unsigned BitSize = Ty->getPrimitiveSizeInBits();
313 // There is no cost model for constants with a bit size of 0. Return TCC_Free
314 // here, so that constant hoisting will ignore this constant.
315 if (BitSize == 0)
316 return TTI::TCC_Free;
317 // No cost model has been implemented yet for operations on integers wider than 64 bits.
318 if (BitSize > 64)
319 return TTI::TCC_Free;
320
321 switch (IID) {
322 default:
323 return TTI::TCC_Free;
324 case Intrinsic::sadd_with_overflow:
325 case Intrinsic::uadd_with_overflow:
326 case Intrinsic::ssub_with_overflow:
327 case Intrinsic::usub_with_overflow:
328 // These get expanded to include a normal addition/subtraction.
329 if (Idx == 1 && Imm.getBitWidth() <= 64) {
330 if (isUInt<32>(Imm.getZExtValue()))
331 return TTI::TCC_Free;
332 if (isUInt<32>(-Imm.getSExtValue()))
333 return TTI::TCC_Free;
334 }
335 break;
336 case Intrinsic::smul_with_overflow:
337 case Intrinsic::umul_with_overflow:
338 // These get expanded to include a normal multiplication.
339 if (Idx == 1 && Imm.getBitWidth() <= 64) {
340 if (isInt<32>(Imm.getSExtValue()))
341 return TTI::TCC_Free;
342 }
343 break;
344 case Intrinsic::experimental_stackmap:
345 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
346 return TTI::TCC_Free;
347 break;
348 case Intrinsic::experimental_patchpoint_void:
349 case Intrinsic::experimental_patchpoint:
350 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
351 return TTI::TCC_Free;
352 break;
353 }
354 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
355}
356
357TargetTransformInfo::PopcntSupportKind
358SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) const {
359 assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
360 if (ST->hasPopulationCount() && TyWidth <= 64)
361 return TTI::PSK_FastHardware;
362 return TTI::PSK_Software;
363}
364
365void SystemZTTIImpl::getUnrollingPreferences(
366 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
367 OptimizationRemarkEmitter *ORE) const {
368 // Find out if L contains a call, what the machine instruction count
369 // estimate is, and how many stores there are.
370 bool HasCall = false;
371 InstructionCost NumStores = 0;
372 for (auto &BB : L->blocks())
373 for (auto &I : *BB) {
374 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
375 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
376 if (isLoweredToCall(F))
377 HasCall = true;
378 if (F->getIntrinsicID() == Intrinsic::memcpy ||
379 F->getIntrinsicID() == Intrinsic::memset)
380 NumStores++;
381 } else { // indirect call.
382 HasCall = true;
383 }
384 }
385 if (isa<StoreInst>(&I)) {
386 Type *MemAccessTy = I.getOperand(0)->getType();
387 NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, Align(),
388 0, TTI::TCK_RecipThroughput);
389 }
390 }
391
392 // The z13 processor will run out of store tags if too many stores
393 // are fed into it too quickly. Therefore make sure there are not
394 // too many stores in the resulting unrolled loop.
395 unsigned const NumStoresVal = NumStores.getValue();
396 unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);
397
398 if (HasCall) {
399 // If the loop has any calls, only allow full unrolling (no partial unrolling).
400 UP.FullUnrollMaxCount = Max;
401 UP.MaxCount = 1;
402 return;
403 }
404
405 UP.MaxCount = Max;
406 if (UP.MaxCount <= 1)
407 return;
408
409 // Allow partial and runtime trip count unrolling.
410 UP.Partial = UP.Runtime = true;
411
412 UP.PartialThreshold = 75;
413 UP.DefaultUnrollRuntimeCount = 4;
414
415 // Allow expensive instructions in the pre-header of the loop.
416 UP.AllowExpensiveTripCount = true;
417
418 UP.Force = true;
419}
420
421void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
422 TTI::PeelingPreferences &PP) const {
423 BaseT::getPeelingPreferences(L, SE, PP);
424}
425
426bool SystemZTTIImpl::isLSRCostLess(
427 const TargetTransformInfo::LSRCost &C1,
428 const TargetTransformInfo::LSRCost &C2) const {
429 // SystemZ specific: check instruction count (first), and don't care about
430 // ImmCost, since offsets are checked explicitly.
431 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
432 C1.NumIVMuls, C1.NumBaseAdds,
433 C1.ScaleCost, C1.SetupCost) <
434 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
435 C2.NumIVMuls, C2.NumBaseAdds,
436 C2.ScaleCost, C2.SetupCost);
437}
438
439unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
440 bool Vector = (ClassID == 1);
441 if (!Vector)
442 // Discount the stack pointer. Also leave out %r0, since it can't
443 // be used in an address.
444 return 14;
445 if (ST->hasVector())
446 return 32;
447 return 0;
448}
449
450TypeSize
451SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
452 switch (K) {
453 case TargetTransformInfo::RGK_Scalar:
454 return TypeSize::getFixed(64);
455 case TargetTransformInfo::RGK_FixedWidthVector:
456 return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
457 case TargetTransformInfo::RGK_ScalableVector:
458 return TypeSize::getScalable(0);
459 }
460
461 llvm_unreachable("Unsupported register kind");
462}
463
464unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
465 unsigned NumStridedMemAccesses,
466 unsigned NumPrefetches,
467 bool HasCall) const {
468 // Don't prefetch a loop with many far apart accesses.
469 if (NumPrefetches > 16)
470 return UINT_MAX;
471
472 // Emit prefetch instructions for smaller strides in cases where we think
473 // the hardware prefetcher might not be able to keep up.
474 if (NumStridedMemAccesses > 32 && !HasCall &&
475 (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
476 return 1;
477
478 return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
479}
480
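// SystemZ's scalar integer divide produces both quotient and remainder, so a
// div/rem pair on a legal scalar integer type does not need to be expanded.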
481bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
482 EVT VT = TLI->getValueType(DL, DataType);
483 return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
484}
485
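// Returns true if Op is a scalar element load that will be folded into its
// single user, making the element insertion free. Loads that feed a store are
// excluded, since those are better handled as a memory-to-memory MVC.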
486static bool isFreeEltLoad(const Value *Op) {
487 if (isa<LoadInst>(Op) && Op->hasOneUse()) {
488 const Instruction *UserI = cast<Instruction>(*Op->user_begin());
489 return !isa<StoreInst>(UserI); // Prefer MVC
490 }
491 return false;
492}
493
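// SystemZ-specific scalarization overhead: pairs of i64 elements can be
// inserted with a single VLVGP (two GPRs at once), and element loads (VLE)
// are free, so discount those before adding the generic estimate.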
494InstructionCost SystemZTTIImpl::getScalarizationOverhead(
495 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
496 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
497 ArrayRef<Value *> VL) const {
498 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
499 InstructionCost Cost = 0;
500
501 if (Insert && Ty->isIntOrIntVectorTy(64)) {
502 // VLVGP will insert two GPRs with one instruction, while VLE will load
503 // an element directly with no extra cost
504 assert((VL.empty() || VL.size() == NumElts) &&
505 "Type does not match the number of values.");
506 InstructionCost CurrVectorCost = 0;
507 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
508 if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
509 ++CurrVectorCost;
510 if (Idx % 2 == 1) {
511 Cost += std::min(InstructionCost(1), CurrVectorCost);
512 CurrVectorCost = 0;
513 }
514 }
515 Insert = false;
516 }
517
518 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
519 CostKind, ForPoisonSrc, VL);
520 return Cost;
521}
522
523// Return the bit size for the scalar type or vector element
524// type. getScalarSizeInBits() returns 0 for a pointer type.
525static unsigned getScalarSizeInBits(Type *Ty) {
526 unsigned Size =
527 (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
528 assert(Size > 0 && "Element must have non-zero size.");
529 return Size;
530}
531
532// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
533// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
534// 3.
535static unsigned getNumVectorRegs(Type *Ty) {
536 auto *VTy = cast<FixedVectorType>(Ty);
537 unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
538 assert(WideBits > 0 && "Could not compute size of vector");
539 return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
540}
541
542InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
543 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
544 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
545 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
546
547 // TODO: Handle more cost kinds.
548 if (CostKind != TTI::TCK_RecipThroughput)
549 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
550 Op2Info, Args, CxtI);
551
552 // TODO: return a good value for BB-VECTORIZER that includes the
553 // immediate loads, which we do not want to count for the loop
554 // vectorizer, since they are hopefully hoisted out of the loop. This
555 // would require a new parameter 'InLoop', but not sure if constant
556 // args are common enough to motivate this.
557
558 unsigned ScalarBits = Ty->getScalarSizeInBits();
559
560 // There are three cases of division and remainder: Dividing with a register
561 // needs a divide instruction. A divisor which is a power of two constant
562 // can be implemented with a sequence of shifts. Any other constant needs a
563 // multiply and shifts.
564 const unsigned DivInstrCost = 20;
565 const unsigned DivMulSeqCost = 10;
566 const unsigned SDivPow2Cost = 4;
567
568 bool SignedDivRem =
569 Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
570 bool UnsignedDivRem =
571 Opcode == Instruction::UDiv || Opcode == Instruction::URem;
572
573 // Check for a constant divisor.
574 bool DivRemConst = false;
575 bool DivRemConstPow2 = false;
576 if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
577 if (const Constant *C = dyn_cast<Constant>(Args[1])) {
578 const ConstantInt *CVal =
579 (C->getType()->isVectorTy()
580 ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
581 : dyn_cast<const ConstantInt>(C));
582 if (CVal && (CVal->getValue().isPowerOf2() ||
583 CVal->getValue().isNegatedPowerOf2()))
584 DivRemConstPow2 = true;
585 else
586 DivRemConst = true;
587 }
588 }
589
590 if (!Ty->isVectorTy()) {
591 // These FP operations are supported with a dedicated instruction for
592 // float, double and fp128 (base implementation assumes float generally
593 // costs 2).
594 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
595 Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
596 return 1;
597
598 // There is no native support for FRem.
599 if (Opcode == Instruction::FRem)
600 return LIBCALL_COST;
601
602 // Give discount for some combined logical operations if supported.
603 if (Args.size() == 2) {
604 if (Opcode == Instruction::Xor) {
605 for (const Value *A : Args) {
606 if (const Instruction *I = dyn_cast<Instruction>(A))
607 if (I->hasOneUse() &&
608 (I->getOpcode() == Instruction::Or ||
609 I->getOpcode() == Instruction::And ||
610 I->getOpcode() == Instruction::Xor))
611 if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
612 (isInt128InVR(Ty) &&
613 (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
614 return 0;
615 }
616 }
617 else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
618 for (const Value *A : Args) {
619 if (const Instruction *I = dyn_cast<Instruction>(A))
620 if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
621 ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
622 (isInt128InVR(Ty) &&
623 (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
624 return 0;
625 }
626 }
627 }
628
629 // Or requires one instruction, although it has custom handling for i64.
630 if (Opcode == Instruction::Or)
631 return 1;
632
633 if (Opcode == Instruction::Xor && ScalarBits == 1) {
634 if (ST->hasLoadStoreOnCond2())
635 return 5; // 2 * (li 0; loc 1); xor
636 return 7; // 2 * ipm sequences ; xor ; shift ; compare
637 }
638
639 if (DivRemConstPow2)
640 return (SignedDivRem ? SDivPow2Cost : 1);
641 if (DivRemConst)
642 return DivMulSeqCost;
643 if (SignedDivRem || UnsignedDivRem)
644 return DivInstrCost;
645 }
646 else if (ST->hasVector()) {
647 auto *VTy = cast<FixedVectorType>(Ty);
648 unsigned VF = VTy->getNumElements();
649 unsigned NumVectors = getNumVectorRegs(Ty);
650
651 // These vector operations are custom handled, but are still supported
652 // with one instruction per vector, regardless of element size.
653 if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
654 Opcode == Instruction::AShr) {
655 return NumVectors;
656 }
657
658 if (DivRemConstPow2)
659 return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
660 if (DivRemConst) {
661 SmallVector<Type *> Tys(Args.size(), Ty);
662 return VF * DivMulSeqCost +
663 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
664 }
665 if (SignedDivRem || UnsignedDivRem) {
666 if (ST->hasVectorEnhancements3() && ScalarBits >= 32)
667 return NumVectors * DivInstrCost;
668 else if (VF > 4)
669 // Temporary hack: disable high vectorization factors with integer
670 // division/remainder, which will get scalarized and handled with
671 // GR128 registers. The mischeduler is not clever enough to avoid
672 // spilling yet.
673 return 1000;
674 }
675
676 // These FP operations are supported with a single vector instruction for
677 // double (base implementation assumes float generally costs 2). For
678 // FP128, the scalar cost is 1, and there is no overhead since the values
679 // are already in scalar registers.
680 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
681 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
682 switch (ScalarBits) {
683 case 32: {
684 // The vector enhancements facility 1 provides v4f32 instructions.
685 if (ST->hasVectorEnhancements1())
686 return NumVectors;
687 // Return the cost of multiple scalar invocation plus the cost of
688 // inserting and extracting the values.
689 InstructionCost ScalarCost =
690 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
691 SmallVector<Type *> Tys(Args.size(), Ty);
692 InstructionCost Cost =
693 (VF * ScalarCost) +
694 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
695 // FIXME: VF 2 for these FP operations are currently just as
696 // expensive as for VF 4.
697 if (VF == 2)
698 Cost *= 2;
699 return Cost;
700 }
701 case 64:
702 case 128:
703 return NumVectors;
704 default:
705 break;
706 }
707 }
708
709 // There is no native support for FRem.
710 if (Opcode == Instruction::FRem) {
711 SmallVector<Type *> Tys(Args.size(), Ty);
712 InstructionCost Cost =
713 (VF * LIBCALL_COST) +
714 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
715 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
716 if (VF == 2 && ScalarBits == 32)
717 Cost *= 2;
718 return Cost;
719 }
720 }
721
722 // Fallback to the default implementation.
723 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
724 Args, CxtI);
725}
726
727InstructionCost
728SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
729 VectorType *SrcTy, ArrayRef<int> Mask,
730 TTI::TargetCostKind CostKind, int Index,
731 VectorType *SubTp, ArrayRef<const Value *> Args,
732 const Instruction *CxtI) const {
733 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
734 if (ST->hasVector()) {
735 unsigned NumVectors = getNumVectorRegs(SrcTy);
736
737 // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
738
739 // FP128 values are always in scalar registers, so there is no work
740 // involved with a shuffle, except for broadcast. In that case register
741 // moves are done with a single instruction per element.
742 if (SrcTy->getScalarType()->isFP128Ty())
743 return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
744
745 switch (Kind) {
746 case TargetTransformInfo::SK_ExtractSubvector:
747 // ExtractSubvector Index indicates start offset.
748
749 // Extracting a subvector from first index is a noop.
750 return (Index == 0 ? 0 : NumVectors);
751
752 case TargetTransformInfo::SK_Broadcast:
753 // Loop vectorizer calls here to figure out the extra cost of
754 // broadcasting a loaded value to all elements of a vector. Since vlrep
755 // loads and replicates with a single instruction, adjust the returned
756 // value.
757 return NumVectors - 1;
758
759 default:
760
761 // SystemZ supports single instruction permutation / replication.
762 return NumVectors;
763 }
764 }
765
766 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
767 SubTp);
768}
769
770// Return the log2 difference of the element sizes of the two vector types.
771static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
772 unsigned Bits0 = Ty0->getScalarSizeInBits();
773 unsigned Bits1 = Ty1->getScalarSizeInBits();
774
775 if (Bits1 > Bits0)
776 return (Log2_32(Bits1) - Log2_32(Bits0));
777
778 return (Log2_32(Bits0) - Log2_32(Bits1));
779}
780
781// Return the number of instructions needed to truncate SrcTy to DstTy.
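// For example, <8 x i64> -> <8 x i8>: 4 source registers and a log2
// element-size difference of 3 give 2 + 1 + 1 = 4 steps, minus one for the
// special case handled at the end, i.e. 3 instructions.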
782unsigned SystemZTTIImpl::getVectorTruncCost(Type *SrcTy, Type *DstTy) const {
783 assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
784 assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
785 DstTy->getPrimitiveSizeInBits().getFixedValue() &&
786 "Packing must reduce size of vector type.");
787 assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
788 cast<FixedVectorType>(DstTy)->getNumElements() &&
789 "Packing should not change number of elements.");
790
791 // TODO: Since fp32 is expanded, the extract cost should always be 0.
792
793 unsigned NumParts = getNumVectorRegs(SrcTy);
794 if (NumParts <= 2)
795 // Up to 2 vector registers can be truncated efficiently with pack or
796 // permute. The latter requires an immediate mask to be loaded, which
797 // typically gets hoisted out of a loop. TODO: return a good value for
798 // BB-VECTORIZER that includes the immediate loads, which we do not want
799 // to count for the loop vectorizer.
800 return 1;
801
802 unsigned Cost = 0;
803 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
804 unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
805 for (unsigned P = 0; P < Log2Diff; ++P) {
806 if (NumParts > 1)
807 NumParts /= 2;
808 Cost += NumParts;
809 }
810
811 // Currently, a general mix of permutes and pack instructions is output by
812 // isel, which follow the cost computation above except for this case which
813 // is one instruction less:
814 if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
815 DstTy->getScalarSizeInBits() == 8)
816 Cost--;
817
818 return Cost;
819}
820
821// Return the cost of converting a vector bitmask produced by a compare
822// (SrcTy), to the type of the select or extend instruction (DstTy).
823unsigned SystemZTTIImpl::getVectorBitmaskConversionCost(Type *SrcTy,
824 Type *DstTy) const {
825 assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
826 "Should only be called with vector types.");
827
828 unsigned PackCost = 0;
829 unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
830 unsigned DstScalarBits = DstTy->getScalarSizeInBits();
831 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
832 if (SrcScalarBits > DstScalarBits)
833 // The bitmask will be truncated.
834 PackCost = getVectorTruncCost(SrcTy, DstTy);
835 else if (SrcScalarBits < DstScalarBits) {
836 unsigned DstNumParts = getNumVectorRegs(DstTy);
837 // Each vector select needs its part of the bitmask unpacked.
838 PackCost = Log2Diff * DstNumParts;
839 // Extra cost for moving part of mask before unpacking.
840 PackCost += DstNumParts - 1;
841 }
842
843 return PackCost;
844}
845
846// Return the type of the compared operands. This is needed to compute the
847// cost for a Select / ZExt or SExt instruction.
848static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
849 Type *OpTy = nullptr;
850 if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
851 OpTy = CI->getOperand(0)->getType();
852 else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
853 if (LogicI->getNumOperands() == 2)
854 if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
855 if (isa<CmpInst>(LogicI->getOperand(1)))
856 OpTy = CI0->getOperand(0)->getType();
857
858 if (OpTy != nullptr) {
859 if (VF == 1) {
860 assert (!OpTy->isVectorTy() && "Expected scalar type");
861 return OpTy;
862 }
863 // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
864 // be either scalar or already vectorized with a same or lesser VF.
865 Type *ElTy = OpTy->getScalarType();
866 return FixedVectorType::get(ElTy, VF);
867 }
868
869 return nullptr;
870}
871
872// Get the cost of converting a boolean vector to a vector with same width
873// and element size as Dst, plus the cost of zero extending if needed.
874unsigned
875SystemZTTIImpl::getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
876 const Instruction *I) const {
877 auto *DstVTy = cast<FixedVectorType>(Dst);
878 unsigned VF = DstVTy->getNumElements();
879 unsigned Cost = 0;
880 // If we know the widths of the compared operands, add any cost of
881 // converting it to match Dst. Otherwise assume the same widths.
882 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
883 if (CmpOpTy != nullptr)
884 Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
885 if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
886 // One 'vn' per dst vector with an immediate mask.
887 Cost += getNumVectorRegs(Dst);
888 return Cost;
889}
890
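// Cost of conversion (cast) instructions. Scalar and vector sources are
// handled separately below; i1 and i128 values and float<->int vector
// conversions get special treatment.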
891InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
892 Type *Src,
893 TTI::CastContextHint CCH,
894 TTI::TargetCostKind CostKind,
895 const Instruction *I) const {
896 // FIXME: Can the logic below also be used for these cost kinds?
897 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
898 auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
899 return BaseCost == 0 ? BaseCost : 1;
900 }
901
902 unsigned DstScalarBits = Dst->getScalarSizeInBits();
903 unsigned SrcScalarBits = Src->getScalarSizeInBits();
904
905 if (!Src->isVectorTy()) {
906 if (Dst->isVectorTy())
907 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
908
909 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
910 if (Src->isIntegerTy(128))
911 return LIBCALL_COST;
912 if (SrcScalarBits >= 32 ||
913 (I != nullptr && isa<LoadInst>(I->getOperand(0))))
914 return 1;
915 return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
916 }
917
918 if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
919 Dst->isIntegerTy(128))
920 return LIBCALL_COST;
921
922 if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
923 if (Src->isIntegerTy(1)) {
924 if (DstScalarBits == 128) {
925 if (Opcode == Instruction::SExt && ST->hasVectorEnhancements3())
926 return 0;/*VCEQQ*/
927 return 5 /*branch seq.*/;
928 }
929
930 if (ST->hasLoadStoreOnCond2())
931 return 2; // li 0; loc 1
932
933 // This should be extension of a compare i1 result, which is done with
934 // ipm and a varying sequence of instructions.
935 unsigned Cost = 0;
936 if (Opcode == Instruction::SExt)
937 Cost = (DstScalarBits < 64 ? 3 : 4);
938 if (Opcode == Instruction::ZExt)
939 Cost = 3;
940 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
941 if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
942 // If operands of an fp type were compared, this costs +1.
943 Cost++;
944 return Cost;
945 }
946 else if (isInt128InVR(Dst)) {
947 // Extensions from GPR to i128 (in VR) typically costs two instructions,
948 // but a zero-extending load would be just one extra instruction.
949 if (Opcode == Instruction::ZExt && I != nullptr)
950 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
951 if (Ld->hasOneUse())
952 return 1;
953 return 2;
954 }
955 }
956
957 if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
958 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
959 if (Ld->hasOneUse())
960 return 0; // Will be converted to GPR load.
961 bool OnlyTruncatingStores = true;
962 for (const User *U : I->users())
963 if (!isa<StoreInst>(U)) {
964 OnlyTruncatingStores = false;
965 break;
966 }
967 if (OnlyTruncatingStores)
968 return 0;
969 return 2; // Vector element extraction.
970 }
971 }
972 else if (ST->hasVector()) {
973 // Vector to scalar cast.
974 auto *SrcVecTy = cast<FixedVectorType>(Src);
975 auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
976 if (!DstVecTy) {
977 // TODO: tune vector-to-scalar cast.
978 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
979 }
980 unsigned VF = SrcVecTy->getNumElements();
981 unsigned NumDstVectors = getNumVectorRegs(Dst);
982 unsigned NumSrcVectors = getNumVectorRegs(Src);
983
984 if (Opcode == Instruction::Trunc) {
985 if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
986 return 0; // Check for NOOP conversions.
987 return getVectorTruncCost(Src, Dst);
988 }
989
990 if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
991 if (SrcScalarBits >= 8) {
992 // ZExt will use either a single unpack or a vector permute.
993 if (Opcode == Instruction::ZExt)
994 return NumDstVectors;
995
996 // SExt will be handled with one unpack per doubling of width.
997 unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
998
999 // For types that span multiple vector registers, some additional
1000 // instructions are used to setup the unpacking.
1001 unsigned NumSrcVectorOps =
1002 (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
1003 : (NumDstVectors / 2));
1004
1005 return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
1006 }
1007 else if (SrcScalarBits == 1)
1008 return getBoolVecToIntConversionCost(Opcode, Dst, I);
1009 }
1010
1011 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
1012 Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
1013 // TODO: Fix base implementation which could simplify things a bit here
1014 // (seems to miss on differentiating on scalar/vector types).
1015
1016 // Only 64 bit vector conversions are natively supported before z15.
1017 if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
1018 if (SrcScalarBits == DstScalarBits)
1019 return NumDstVectors;
1020
1021 if (SrcScalarBits == 1)
1022 return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
1023 }
1024
1025 // Return the cost of multiple scalar invocation plus the cost of
1026 // inserting and extracting the values. Base implementation does not
1027 // realize float->int gets scalarized.
1028 InstructionCost ScalarCost = getCastInstrCost(
1029 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
1030 InstructionCost TotCost = VF * ScalarCost;
1031 bool NeedsInserts = true, NeedsExtracts = true;
1032 // FP128 registers do not get inserted or extracted.
1033 if (DstScalarBits == 128 &&
1034 (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
1035 NeedsInserts = false;
1036 if (SrcScalarBits == 128 &&
1037 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
1038 NeedsExtracts = false;
1039
1040 TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1041 NeedsExtracts, CostKind);
1042 TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
1043 /*Extract*/ false, CostKind);
1044
1045 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
1046 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
1047 TotCost *= 2;
1048
1049 return TotCost;
1050 }
1051
1052 if (Opcode == Instruction::FPTrunc) {
1053 if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
1054 return VF /*ldxbr/lexbr*/ +
1055 BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
1056 /*Extract*/ false, CostKind);
1057 else // double -> float
1058 return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
1059 }
1060
1061 if (Opcode == Instruction::FPExt) {
1062 if (SrcScalarBits == 32 && DstScalarBits == 64) {
1063 // float -> double is very rare and currently unoptimized. Instead of
1064 // using vldeb, which can do two at a time, all conversions are
1065 // scalarized.
1066 return VF * 2;
1067 }
1068 // -> fp128. VF * lxdb/lxeb + extraction of elements.
1069 return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1070 /*Extract*/ true, CostKind);
1071 }
1072 }
1073
1074 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1075}
1076
1077// Scalar i8 / i16 operations will typically be made after first extending
1078// the operands to i32.
1079static unsigned getOperandsExtensionCost(const Instruction *I) {
1080 unsigned ExtCost = 0;
1081 for (Value *Op : I->operands())
1082 // A load of i8 or i16 sign/zero extends to i32.
1083 if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
1084 ExtCost++;
1085
1086 return ExtCost;
1087}
1088
1089InstructionCost SystemZTTIImpl::getCmpSelInstrCost(
1090 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1091 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
1092 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
1093 if (CostKind != TTI::TCK_RecipThroughput)
1094 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1095 Op1Info, Op2Info);
1096
1097 if (!ValTy->isVectorTy()) {
1098 switch (Opcode) {
1099 case Instruction::ICmp: {
1100 // A loaded value compared with 0 with multiple users becomes Load and
1101 // Test. The load is then not foldable, so return 0 cost for the ICmp.
1102 unsigned ScalarBits = ValTy->getScalarSizeInBits();
1103 if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
1104 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
1105 if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
1106 if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
1107 C->isZero())
1108 return 0;
1109
1110 unsigned Cost = 1;
1111 if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
1112 Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
1113 return Cost;
1114 }
1115 case Instruction::Select:
1116 if (ValTy->isFloatingPointTy())
1117 return 4; // No LOC for FP - costs a conditional jump.
1118
1119 // When selecting based on an i128 comparison, LOC / VSEL is possible
1120 // if i128 comparisons are directly supported.
1121 if (I != nullptr)
1122 if (ICmpInst *CI = dyn_cast<ICmpInst>(I->getOperand(0)))
1123 if (CI->getOperand(0)->getType()->isIntegerTy(128))
1124 return ST->hasVectorEnhancements3() ? 1 : 4;
1125
1126 // Load On Condition / Select Register available, except for i128.
1127 return !isInt128InVR(ValTy) ? 1 : 4;
1128 }
1129 }
1130 else if (ST->hasVector()) {
1131 unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
1132
1133 // Called with a compare instruction.
1134 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
1135 unsigned PredicateExtraCost = 0;
1136 if (I != nullptr) {
1137 // Some predicates cost one or two extra instructions.
1138 switch (cast<CmpInst>(I)->getPredicate()) {
1139 case CmpInst::Predicate::ICMP_NE:
1140 case CmpInst::Predicate::ICMP_UGE:
1141 case CmpInst::Predicate::ICMP_ULE:
1142 case CmpInst::Predicate::ICMP_SGE:
1143 case CmpInst::Predicate::ICMP_SLE:
1144 PredicateExtraCost = 1;
1145 break;
1146 case CmpInst::Predicate::FCMP_ONE:
1147 case CmpInst::Predicate::FCMP_ORD:
1148 case CmpInst::Predicate::FCMP_UEQ:
1149 case CmpInst::Predicate::FCMP_UNO:
1150 PredicateExtraCost = 2;
1151 break;
1152 default:
1153 break;
1154 }
1155 }
1156
1157 // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
1158 // floats. FIXME: <2 x float> generates same code as <4 x float>.
1159 unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
1160 unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
1161
1162 unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
1163 return Cost;
1164 }
1165 else { // Called with a select instruction.
1166 assert (Opcode == Instruction::Select);
1167
1168 // We can figure out the extra cost of packing / unpacking if the
1169 // instruction was passed and the compare instruction is found.
1170 unsigned PackCost = 0;
1171 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
1172 if (CmpOpTy != nullptr)
1173 PackCost =
1174 getVectorBitmaskConversionCost(CmpOpTy, ValTy);
1175
1176 return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
1177 }
1178 }
1179
1180 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1181 Op1Info, Op2Info);
1182}
1183
1184InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1185 TTI::TargetCostKind CostKind,
1186 unsigned Index,
1187 const Value *Op0,
1188 const Value *Op1) const {
1189 if (Opcode == Instruction::InsertElement) {
1190 // Vector Element Load.
1191 if (Op1 != nullptr && isFreeEltLoad(Op1))
1192 return 0;
1193
1194 // vlvgp will insert two grs into a vector register, so count half the
1195 // number of instructions as an estimate when we don't have the full
1196 // picture (as in getScalarizationOverhead()).
1197 if (Val->isIntOrIntVectorTy(64))
1198 return ((Index % 2 == 0) ? 1 : 0);
1199 }
1200
1201 if (Opcode == Instruction::ExtractElement) {
1202 int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
1203
1204 // Give a slight penalty for moving out of vector pipeline to FXU unit.
1205 if (Index == 0 && Val->isIntOrIntVectorTy())
1206 Cost += 1;
1207
1208 return Cost;
1209 }
1210
1211 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1212}
1213
1214// Check if a load may be folded as a memory operand in its user.
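// For example, a single-use i32 load that is sign-extended to i64 and used as
// the second operand of a 64-bit subtraction can be folded into the
// subtraction as a memory operand, so the load itself is modeled as free.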
1215bool SystemZTTIImpl::isFoldableLoad(const LoadInst *Ld,
1216 const Instruction *&FoldedValue) const {
1217 if (!Ld->hasOneUse())
1218 return false;
1219 FoldedValue = Ld;
1220 const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
1221 unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
1222 unsigned TruncBits = 0;
1223 unsigned SExtBits = 0;
1224 unsigned ZExtBits = 0;
1225 if (UserI->hasOneUse()) {
1226 unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1227 if (isa<TruncInst>(UserI))
1228 TruncBits = UserBits;
1229 else if (isa<SExtInst>(UserI))
1230 SExtBits = UserBits;
1231 else if (isa<ZExtInst>(UserI))
1232 ZExtBits = UserBits;
1233 }
1234 if (TruncBits || SExtBits || ZExtBits) {
1235 FoldedValue = UserI;
1236 UserI = cast<Instruction>(*UserI->user_begin());
1237 // Load (single use) -> trunc/extend (single use) -> UserI
1238 }
1239 if ((UserI->getOpcode() == Instruction::Sub ||
1240 UserI->getOpcode() == Instruction::SDiv ||
1241 UserI->getOpcode() == Instruction::UDiv) &&
1242 UserI->getOperand(1) != FoldedValue)
1243 return false; // Not commutative, only RHS foldable.
1244 // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
1245 // extension was made of the load.
1246 unsigned LoadOrTruncBits =
1247 ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
1248 switch (UserI->getOpcode()) {
1249 case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1250 case Instruction::Sub:
1251 case Instruction::ICmp:
1252 if (LoadedBits == 32 && ZExtBits == 64)
1253 return true;
1254 [[fallthrough]];
1255 case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
1256 if (UserI->getOpcode() != Instruction::ICmp) {
1257 if (LoadedBits == 16 &&
1258 (SExtBits == 32 ||
1259 (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
1260 return true;
1261 if (LoadOrTruncBits == 16)
1262 return true;
1263 }
1264 [[fallthrough]];
1265 case Instruction::SDiv:// SE: 32->64
1266 if (LoadedBits == 32 && SExtBits == 64)
1267 return true;
1268 [[fallthrough]];
1269 case Instruction::UDiv:
1270 case Instruction::And:
1271 case Instruction::Or:
1272 case Instruction::Xor:
1273 // This also makes sense for float operations, but disabled for now due
1274 // to regressions.
1275 // case Instruction::FCmp:
1276 // case Instruction::FAdd:
1277 // case Instruction::FSub:
1278 // case Instruction::FMul:
1279 // case Instruction::FDiv:
1280
1281 // All possible extensions of memory checked above.
1282
1283 // Comparison between memory and immediate.
1284 if (UserI->getOpcode() == Instruction::ICmp)
1285 if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1286 if (CI->getValue().isIntN(16))
1287 return true;
1288 return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
1289 break;
1290 }
1291 return false;
1292}
1293
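// Returns true if V is a call to the llvm.bswap intrinsic. Used below to give
// loads/stores whose value is only byte-swapped a discount, since they map to
// the load/store-reversed instructions.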
1294static bool isBswapIntrinsicCall(const Value *V) {
1295 if (const Instruction *I = dyn_cast<Instruction>(V))
1296 if (auto *CI = dyn_cast<CallInst>(I))
1297 if (auto *F = CI->getCalledFunction())
1298 if (F->getIntrinsicID() == Intrinsic::bswap)
1299 return true;
1300 return false;
1301}
1302
1303InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1304 Align Alignment,
1305 unsigned AddressSpace,
1306 TTI::TargetCostKind CostKind,
1307 TTI::OperandValueInfo OpInfo,
1308 const Instruction *I) const {
1309 assert(!Src->isVoidTy() && "Invalid type");
1310
1311 // TODO: Handle other cost kinds.
1312 if (CostKind != TTI::TCK_RecipThroughput)
1313 return 1;
1314
1315 if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1316 // Store the load or its truncated or extended value in FoldedValue.
1317 const Instruction *FoldedValue = nullptr;
1318 if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1319 const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1320 assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1321
1322 // UserI can't fold two loads, so in that case return 0 cost only
1323 // half of the time.
1324 for (unsigned i = 0; i < 2; ++i) {
1325 if (UserI->getOperand(i) == FoldedValue)
1326 continue;
1327
1328 if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1329 LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1330 if (!OtherLoad &&
1331 (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1332 isa<ZExtInst>(OtherOp)))
1333 OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
1334 if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1335 return i == 0; // Both operands foldable.
1336 }
1337 }
1338
1339 return 0; // Only I is foldable in user.
1340 }
1341 }
1342
1343 // Type legalization (via getNumberOfParts) can't handle structs
1344 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1345 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1346 CostKind);
1347
1348 // FP128 is a legal type but kept in a register pair on older CPUs.
1349 if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1350 return 2;
1351
1352 unsigned NumOps =
1353 (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1354
1355 // Store/Load reversed saves one instruction.
1356 if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1357 I != nullptr) {
1358 if (Opcode == Instruction::Load && I->hasOneUse()) {
1359 const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1360 // In case of load -> bswap -> store, return normal cost for the load.
1361 if (isBswapIntrinsicCall(LdUser) &&
1362 (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1363 return 0;
1364 }
1365 else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1366 const Value *StoredVal = SI->getValueOperand();
1367 if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1368 return 0;
1369 }
1370 }
1371
1372 return NumOps;
1373}
1374
1375// The generic implementation of getInterleavedMemoryOpCost() is based on
1376// adding costs of the memory operations plus all the extracts and inserts
1377// needed for using / defining the vector operands. The SystemZ version does
1378// roughly the same but bases the computations on vector permutations
1379// instead.
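// For example, a load of a factor-2 group with VF = 4 and i32 elements
// (<8 x i32> in total, i.e. 2 vector loads): each member's values live in two
// source vectors and fit in one destination vector, so each member needs
// max(1, 2 - 1) = 1 vperm, giving 2 loads + 2 permutes = 4.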
1380InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
1381 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1382 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1383 bool UseMaskForCond, bool UseMaskForGaps) const {
1384 if (UseMaskForCond || UseMaskForGaps)
1385 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1386 Alignment, AddressSpace, CostKind,
1387 UseMaskForCond, UseMaskForGaps);
1388 assert(isa<VectorType>(VecTy) &&
1389 "Expect a vector type for interleaved memory op");
1390
1391 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1392 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1393 unsigned VF = NumElts / Factor;
1394 unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1395 unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1396 unsigned NumPermutes = 0;
1397
1398 if (Opcode == Instruction::Load) {
1399 // Loading interleave groups may have gaps, which may mean fewer
1400 // loads. Find out how many vectors will be loaded in total, and in how
1401 // many of them each value will be in.
1402 BitVector UsedInsts(NumVectorMemOps, false);
1403 std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1404 for (unsigned Index : Indices)
1405 for (unsigned Elt = 0; Elt < VF; ++Elt) {
1406 unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1407 UsedInsts.set(Vec);
1408 ValueVecs[Index].set(Vec);
1409 }
1410 NumVectorMemOps = UsedInsts.count();
1411
1412 for (unsigned Index : Indices) {
1413 // Estimate that each loaded source vector containing this Index
1414 // requires one operation, except that vperm can handle two input
1415 // registers first time for each dst vector.
1416 unsigned NumSrcVecs = ValueVecs[Index].count();
1417 unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
1418 assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1419 NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1420 }
1421 } else {
1422 // Estimate the permutes for each stored vector as the smaller of the
1423 // number of elements and the number of source vectors. Subtract one per
1424 // dst vector for vperm (S.A.).
1425 unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1426 unsigned NumDstVecs = NumVectorMemOps;
1427 NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1428 }
1429
1430 // Cost of load/store operations and the permutations needed.
1431 return NumVectorMemOps + NumPermutes;
1432}
1433
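// Helpers for the reduction cost hooks below: an integer add reduction first
// combines the input vectors pairwise (NumVec - 1 operations) and then
// finishes with VSUM-based code on the last vector, while the other "fast"
// reductions need a shuffle + op pair per halving step of the final vector.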
1434InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits) {
1435 InstructionCost Cost = 0;
1436 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1437 Cost += NumVec - 1;
1438 // For integer adds, VSUM creates shorter reductions on the final vector.
1439 Cost += (ScalarBits < 32) ? 3 : 2;
1440 return Cost;
1441}
1442
1443InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems,
1444 unsigned ScalarBits) {
1445 unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
1446 InstructionCost Cost = 0;
1447 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1448 Cost += NumVec - 1;
1449 // For each shuffle / arithmetic layer, we need 2 instructions, and we need
1450 // log2(Elements in Last Vector) layers.
1451 Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg));
1452 return Cost;
1453}
1454
1455inline bool customCostReductions(unsigned Opcode) {
1456 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
1457 Opcode == Instruction::Add || Opcode == Instruction::Mul;
1458}
1459
1460InstructionCost
1461SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1462 std::optional<FastMathFlags> FMF,
1463 TTI::TargetCostKind CostKind) const {
1464 unsigned ScalarBits = Ty->getScalarSizeInBits();
1465 // The following is only for subtargets with vector math, non-ordered
1466 // reductions, and reasonable scalar sizes for int and fp add/mul.
1467 if (customCostReductions(Opcode) && ST->hasVector() &&
1468 !TTI::requiresOrderedReduction(FMF) &&
1469 ScalarBits <= SystemZ::VectorBits) {
1470 unsigned NumVectors = getNumVectorRegs(Ty);
1471 unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
1472 // Integer Add uses custom code gen that needs to be accounted for.
1473 if (Opcode == Instruction::Add)
1474 return getIntAddReductionCost(NumVectors, ScalarBits);
1475 // The base cost is the same across all other arithmetic instructions
1476 InstructionCost Cost =
1477 getFastReductionCost(NumVectors, NumElems, ScalarBits);
1478 // But we need to account for the final op involving the scalar operand.
1479 if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul))
1480 Cost += 1;
1481 return Cost;
1482 }
1483 // otherwise, fall back to the standard implementation
1484 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1485}
1486
1487InstructionCost
1488SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1489 FastMathFlags FMF,
1490 TTI::TargetCostKind CostKind) const {
1491 // Return custom costs only on subtargets with vector enhancements.
1492 if (ST->hasVectorEnhancements1()) {
1493 unsigned NumVectors = getNumVectorRegs(Ty);
1494 unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
1495 unsigned ScalarBits = Ty->getScalarSizeInBits();
1496 InstructionCost Cost = 0;
1497 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1498 Cost += NumVectors - 1;
1499 // For the final vector, we need shuffle + min/max operations, and
1500 // we need #Elements - 1 of them.
1501 Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1);
1502 return Cost;
1503 }
1504 // For other targets, fall back to the standard implementation
1505 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1506}
1507
1508static int
1509getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1510 const SmallVectorImpl<Type *> &ParamTys) {
1511 if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1512 return getNumVectorRegs(RetTy); // VPERM
1513
1514 return -1;
1515}
1516
1517InstructionCost
1518SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1519 TTI::TargetCostKind CostKind) const {
1520 InstructionCost Cost = getVectorIntrinsicInstrCost(
1521 ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
1522 if (Cost != -1)
1523 return Cost;
1524 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1525}
1526
1527bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
1528 // Always expand on Subtargets without vector instructions.
1529 if (!ST->hasVector())
1530 return true;
1531
1532 // Whether or not to expand is a per-intrinsic decision.
1533 switch (II->getIntrinsicID()) {
1534 default:
1535 return true;
1536 // Do not expand vector.reduce.add...
1537 case Intrinsic::vector_reduce_add:
1538 auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
1539 // ...unless the scalar size is i64 or larger,
1540 // or the operand vector is not full, since the
1541 // performance benefit is dubious in those cases.
1542 return VType->getScalarSizeInBits() >= 64 ||
1543 VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
1544 }
1545}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static unsigned InstrCount
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Hexagon Common GEP
const HexagonInstrInfo * TII
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static const Function * getCalledFunction(const Value *V)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition: Debug.h:119
bool customCostReductions(unsigned Opcode)
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1)
static bool isBswapIntrinsicCall(const Value *V)
InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits)
static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores, unsigned &NumLoads, const Function *F)
static unsigned getOperandsExtensionCost(const Instruction *I)
static Type * getCmpOpsType(const Instruction *I, unsigned VF=1)
static unsigned getScalarSizeInBits(Type *Ty)
static bool isFreeEltLoad(const Value *Op)
InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems, unsigned ScalarBits)
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, const SmallVectorImpl< Type * > &ParamTys)
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse)
static unsigned getNumVectorRegs(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:888
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
Definition: BasicTTIImpl.h:774
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
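A hedged sketch of calling one of these hooks through the generic TargetTransformInfo interface; the helper name and the concrete vector type are illustrative, not taken from this file:
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"
// Illustrative only: reciprocal-throughput cost of loading a <4 x i32> with 16-byte alignment.
static llvm::InstructionCost
loadCostSketch(const llvm::TargetTransformInfo &TTI, llvm::LLVMContext &Ctx) {
  auto *VTy = llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
  return TTI.getMemoryOpCost(llvm::Instruction::Load, VTy, llvm::Align(16),
                             /*AddressSpace=*/0,
                             llvm::TargetTransformInfo::TCK_RecipThroughput);
}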
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
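A brief sketch of set() and count() together (helper name is illustrative):
#include "llvm/ADT/BitVector.h"
// Illustrative only: set() turns on every bit, so count() then equals the size.
static unsigned allSetCount(unsigned NumBits) {
  llvm::BitVector BV(NumBits);
  BV.set();
  return BV.count();
}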
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
Definition: InstrTypes.h:1116
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1348
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
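A sketch combining the two CallBase accessors above (helper name is illustrative):
#include "llvm/IR/InstrTypes.h"
// Illustrative only: first argument of a direct call, nullptr for indirect calls.
static llvm::Value *firstArgOfDirectCall(const llvm::CallBase &CB) {
  if (CB.getCalledFunction() && CB.arg_size() > 0)
    return CB.getArgOperand(0);
  return nullptr;
}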
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:666
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:708
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:702
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:686
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:689
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:687
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:706
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:688
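A sketch of testing a compare against several of the predicates listed above (helper name is illustrative):
#include "llvm/IR/InstrTypes.h"
// Illustrative only: true for the signed/unsigned "or equal" integer predicates.
static bool isOrEqualIntPredicate(const llvm::CmpInst &CI) {
  switch (CI.getPredicate()) {
  case llvm::CmpInst::ICMP_SLE:
  case llvm::CmpInst::ICMP_SGE:
  case llvm::CmpInst::ICMP_ULE:
  case llvm::CmpInst::ICMP_UGE:
    return true;
  default:
    return false;
  }
}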
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:154
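A sketch of getValue() feeding an APInt query (helper name is illustrative):
#include "llvm/IR/Constants.h"
// Illustrative only: getValue() exposes the underlying APInt for further checks.
static bool isPowerOfTwoConstant(const llvm::Value *V) {
  if (const auto *CI = llvm::dyn_cast<llvm::ConstantInt>(V))
    return CI->getValue().isPowerOf2();
  return false;
}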
This is an important base class in LLVM.
Definition: Constant.h:43
This class represents an Operation in the Expression.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
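A sketch of FixedVectorType::get (helper name is illustrative):
#include "llvm/IR/DerivedTypes.h"
// Illustrative only: builds the fixed-width vector type <4 x i32>.
static llvm::FixedVectorType *getV4I32(llvm::LLVMContext &Ctx) {
  return llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
}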
This instruction compares its operands according to the predicate given to the constructor.
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:312
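A sketch of an opcode check against the Instruction enum (helper name is illustrative):
#include "llvm/IR/Instruction.h"
// Illustrative only: getOpcode() returns a member of the Instruction opcode enum.
static bool isIntegerAdd(const llvm::Instruction &I) {
  return I.getOpcode() == llvm::Instruction::Add;
}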
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
An instruction for reading from memory.
Definition: Instructions.h:180
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:40
This class wraps the llvm.memcpy intrinsic.
The optimization diagnostic interface.
The main scalar evolution driver.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
const SystemZInstrInfo * getInstrInfo() const override
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) const
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getNumberOfRegisters(unsigned ClassID) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) const
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
unsigned adjustInliningThreshold(const CallBase *CB) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
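A hedged sketch of reaching the register-width hook listed above through the generic interface; the helper name is illustrative and assumes the fixed-width vector register class:
#include "llvm/Analysis/TargetTransformInfo.h"
// Illustrative only: width in bits of a fixed-width vector register.
static unsigned vectorRegBits(const llvm::TargetTransformInfo &TTI) {
  return static_cast<unsigned>(
      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector)
          .getFixedValue());
}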
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual bool isLoweredToCall(const Function *F) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
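A hedged sketch of selecting one of these cost kinds when querying a cost (helper name is illustrative):
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
// Illustrative only: the same instruction can be costed under different kinds.
static llvm::InstructionCost
throughputCost(const llvm::TargetTransformInfo &TTI, const llvm::Instruction *I) {
  return TTI.getInstructionCost(I, llvm::TargetTransformInfo::TCK_RecipThroughput);
}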
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:349
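A sketch contrasting the two factory functions above (helper names are illustrative):
#include "llvm/Support/TypeSize.h"
// Illustrative only: a fixed 128-bit size versus a scalable one with a 128-bit minimum.
static llvm::TypeSize fixed128()    { return llvm::TypeSize::getFixed(128); }
static llvm::TypeSize scalable128() { return llvm::TypeSize::getScalable(128); }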
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:246
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:270
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
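A sketch combining two of the Type queries above (helper name is illustrative):
#include "llvm/IR/Type.h"
// Illustrative only: element bit width for integer scalars/vectors, 0 otherwise.
static unsigned intElementBits(const llvm::Type *Ty) {
  return Ty->isIntOrIntVectorTy() ? Ty->getScalarSizeInBits() : 0;
}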
Value * getOperand(unsigned i) const
Definition: User.h:232
unsigned getNumOperands() const
Definition: User.h:254
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
user_iterator user_begin()
Definition: Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
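A sketch of hasOneUse() and user_begin() together (helper name is illustrative):
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
// Illustrative only: the single user of V, if V has exactly one use and it is an instruction.
static const llvm::Instruction *getSingleInstrUser(const llvm::Value *V) {
  if (!V->hasOneUse())
    return nullptr;
  return llvm::dyn_cast<llvm::Instruction>(*V->user_begin());
}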
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
const ParentTy * getParent() const
Definition: ilist_node.h:34
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
const unsigned VectorBits
Definition: SystemZ.h:154
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:349
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:336
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
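A worked sketch of the MathExtras helpers above for Value = 20 (helper name is illustrative):
#include "llvm/Support/MathExtras.h"
// Illustrative only: floor log2 = 4, ceil log2 = 5, not a power of two, ceil(20 / 8) = 3.
static void mathExtrasDemo() {
  unsigned Floor  = llvm::Log2_32(20);          // 4
  unsigned Ceil   = llvm::Log2_32_Ceil(20);     // 5
  bool     IsPow2 = llvm::isPowerOf2_32(20);    // false
  unsigned Chunks = llvm::divideCeil(20u, 8u);  // 3
  (void)Floor; (void)Ceil; (void)IsPow2; (void)Chunks;
}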
InstructionCost Cost
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
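A sketch of constructing an Align and rounding up to it (helper name is illustrative):
#include "llvm/Support/Alignment.h"
#include <cstdint>
// Illustrative only: Align must be a non-zero power of two; alignTo rounds up to it.
static uint64_t roundUpTo16(uint64_t Offset) {
  return llvm::alignTo(Offset, llvm::Align(16));
}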
Extended Value Type.
Definition: ValueTypes.h:35
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
unsigned Insns
TODO: Some of these could be merged.
Parameters that control the generic loop unrolling transformation.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
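A hedged sketch of how a target hook might fill in a few of the unrolling parameters above; the helper name and the chosen values are illustrative, not the SystemZ defaults:
#include "llvm/Analysis/TargetTransformInfo.h"
// Illustrative values only; not taken from this file.
static void tuneUnrollPrefsSketch(llvm::TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                   // allow partial unrolling
  UP.Runtime = true;                   // allow runtime unrolling
  UP.DefaultUnrollRuntimeCount = 4;    // default count for run-time trip counts
  UP.AllowExpensiveTripCount = false;  // avoid expensive trip-count computation
  UP.Force = false;                    // do not force unrolling
}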