//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost-model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than a specific CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem as that was the first CPU to support that feature level
/// and thus most likely has the worst case cost, although we may discard an
/// outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency):
///                     divss    sqrtss    rsqrtss
///   AMD K7            11-16    19        3
///   Piledriver        9-24     13-15     5
///   Jaguar            14       16        2
///   Pentium II,III    18       30        2
///   Nehalem           7-14     7-18      3
///   Haswell           10-13    11        5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
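///
/// For example, the AVX512 cost-table entry below
///   { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } } // Skylake
/// reads as: a scalar f32 fdiv has a reciprocal throughput of 3, a latency
/// of 11, a code size of 1 instruction, and a size-and-latency cost of 1.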
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;

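// The recurring lookup idiom used with these tables throughout this file
// (shown here for orientation; each query scales the per-kind cost by the
// type-legalization count LT.first):
//
//   if (const auto *Entry = CostTableLookup(Table, ISD, LT.second))
//     if (auto KindCost = Entry->Cost[CostKind]) // nullopt if kind unset
//       return LT.first * *KindCost;
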
TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KiB
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KiB
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  return Vector ? VectorClass
         : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
                                         : GPRClass;
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  if (ClassID == VectorClass && !ST->hasSSE1())
    return 0;

  if (!ST->is64Bit())
    return 8;

  if ((ClassID == GPRClass && ST->hasEGPR()) ||
      (ClassID != GPRClass && ST->hasAVX512()))
    return 32;

  return 16;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}
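
// Note (illustrative): with -mprefer-vector-width=256 on an AVX-512 capable
// target, PreferVectorWidth is 256, so RGK_FixedWidthVector reports 256 bits
// above even though 512-bit registers exist.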

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedValue();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let regular unrolling handle the loop instead, which saves the overflow
  // check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TTI::CastContextHint::None, CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TTI::CastContextHint::None, CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
  }
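
  // Illustrative example: for a v8i8 multiply the cost returned above is
  //   zext <8 x i8> to <8 x i16>  +  mul <8 x i16>  +  trunc to <8 x i8>.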

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
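
    // Illustrative example: for a <4 x i32> multiply where both operands are
    // zero-extended from <4 x i8>, every element fits in 15 bits, so the
    // multiply is re-costed above as a v8i16 multiply to reflect a
    // PMADDWD-style lowering.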

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }
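
  // For example: 'mul X, 8' is costed as 'shl X, 3', and 'mul X, -8' as
  // 'sub 0, (shl X, 3)', matching the simplifications described above.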

  // On X86, vector signed division by a constant power-of-two is
  // normally expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(), Op2Info.getNoProps());
    }

    return Cost;
  }
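
  // Sketch of the expansion being costed above, for X sdiv 2^k (bitwidth w):
  //   %sign = ashr X, (w - 1)       ; all-ones if X is negative
  //   %bias = lshr %sign, (w - k)   ; rounding bias for negative values
  //   %tmp  = add X, %bias
  //   %res  = ashr %tmp, k
  // i.e. 2 x AShr + 1 x LShr + 1 x Add, as accumulated above.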

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }
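
  // For example: 'udiv X, 16' is costed as 'lshr X, 4' and 'urem X, 16' as
  // 'and X, 15'.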

  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 12, 8, 12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 5, 10, 10, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 4, 8, 7, 10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 5, 10, 10, 15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5, 10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5, 10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5, 10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 5, 9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 6, 9, 11, 16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 6, 6, 9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 7, 8, 11, 14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8, { 7, 9, 10, 14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8, { 10, 11, 16, 21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64, { 6, 7, 10, 13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9, 13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, { 4, 23, 11, 16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, { 4, 30, 12, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, { 6, 13, 24, 30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, { 6, 19, 13, 15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, { 7, 27, 15, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, { 15, 15, 30, 30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8, { 3, 10, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8, { 3, 11, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8, { 15, 19, 27, 33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8, { 15, 19, 30, 36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8, { 37, 37, 51, 63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2 even though we declare them
    // custom, so that we can detect the cases where the shift amount is a
    // scalar.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128-bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
    // 256-bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
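
  // For example (illustrative): 'shl <4 x i32> X, <1, 2, 3, 4>' is costed
  // below as 'mul <4 x i32> X, <2, 4, 8, 16>'.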

  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long
    // multiplies(3), shifts(3) and adds(2).
    // slm muldq throughput is 2, and addq throughput is 4, thus:
    // 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
    { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8, { 6, 21, 11, 16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 6, 23, 11, 22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16, { 5, 18, 5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL, MVT::v16i8, { 6, 27, 12, 18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 8, 30, 12, 24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16, { 5, 11, 5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8, { 17, 17, 24, 30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 18, 20, 24, 43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16, { 5, 11, 5, 10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8, { 5, 18, 6, 12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8, { 4, 8, 8, 16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 6, 10, 8, 13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

1198 static const CostKindTblEntry AVX1CostTable[] = {
1199 // We don't have to scalarize unsupported ops. We can issue two half-sized
1200 // operations and we only need to extract the upper YMM half.
1201 // Two ops + 1 extract + 1 insert = 4.
1202 { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
1203 { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
1204 { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
1205 { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
1206 { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
1207 { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
1208
1209 { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
1210 { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
1211 { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
1212 { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
1213
1214 { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
1215 { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
1216 { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
1217 { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
1218
1219 { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
1220 { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
1221 { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
1222 { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
1223
1224 { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
1225 { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
1226 { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
1227 { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
1228 { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
1229 { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
1230 { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
1231 { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
1232 { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
1233 { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
1234
1235 { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
1236 { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
1237 { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
1238 { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
1239 { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
1240 { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
1241 { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1242 { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1243
1244 { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
1245 { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
1246 { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1247 { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1248 { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1249 { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1250 { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
1251 { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
1252
1253 { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
1254 { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
1255 { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
1256 { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
1257 { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
1258 { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
1259 { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
1260 { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
1261
1262 { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1263 { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
1264
1265 { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1266 { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1267 { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1268 { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1269 { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1270 { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1271
1272 { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1273 { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1274 { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1275 { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
1276 { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1277 { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
1278
1279 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1280 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1281 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1282 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
1283 { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1284 { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
1285
1286 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1287 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
1288 { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
1289 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1290 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
1291 { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
1292 };
1293
1294 if (ST->hasAVX())
1295 if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
1296 if (auto KindCost = Entry->Cost[CostKind])
1297 return LT.first * *KindCost;
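// Illustrative note (not part of the upstream source): each table entry packs
// the four cost kinds as { RecipThroughput, Latency, CodeSize, SizeAndLatency }.
// Reading the AVX1 table above, a v8i32 SHL that legalizes in one step
// (LT.first == 1) costs 9 for TCK_RecipThroughput, 11 for TCK_Latency,
// 12 for TCK_CodeSize and 17 for TCK_SizeAndLatency.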
1298
1299 static const CostKindTblEntry SSE42CostTable[] = {
1300 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1301 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1302 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1303 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1304
1305 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1306 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1307 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1308 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1309
1310 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1311 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1312 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1313 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1314
1315 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1316 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1317 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1318 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1319
1320 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1321 };
1322
1323 if (ST->hasSSE42())
1324 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1325 if (auto KindCost = Entry->Cost[CostKind])
1326 return LT.first * *KindCost;
1327
1328 static const CostKindTblEntry SSE41CostTable[] = {
1329 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1330 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1331 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1332
1333 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1334 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1335 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1336 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1337
1338 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1339 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1340 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1341 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1342
1343 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1344 };
1345
1346 if (ST->hasSSE41())
1347 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1348 if (auto KindCost = Entry->Cost[CostKind])
1349 return LT.first * *KindCost;
1350
1351 static const CostKindTblEntry SSSE3CostTable[] = {
1352 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1353 };
1354
1355 if (ST->hasSSSE3())
1356 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1357 if (auto KindCost = Entry->Cost[CostKind])
1358 return LT.first * *KindCost;
1359
1360 static const CostKindTblEntry SSE2CostTable[] = {
1361 // We don't correctly identify costs of casts because they are marked as
1362 // custom.
1363 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1364 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1365 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1366 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1367
1368 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1369 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1370 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1371 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1372
1373 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1374 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1375 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1376 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1377
1378 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1379 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1380 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1381 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1382
1383 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1384 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1385 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1386 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1387
1388 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1389 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1390 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1391 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1392
1393 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1394 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1395
1396 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1397 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1398 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1399 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1400
1401 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1402
1403 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1404 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1405 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1406 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407
1408 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1410 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412
1413 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1414 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1415 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416
1417 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1418 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1419 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1420
1421 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1422 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1423 };
1424
1425 if (ST->hasSSE2())
1426 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1427 if (auto KindCost = Entry->Cost[CostKind])
1428 return LT.first * *KindCost;
1429
1430 static const CostKindTblEntry SSE1CostTable[] = {
1431 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1432 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1433
1434 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1435 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1436
1437 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1438 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1439
1440 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1441 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1442
1443 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1444 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1445 };
1446
1447 if (ST->hasSSE1())
1448 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1449 if (auto KindCost = Entry->Cost[CostKind])
1450 return LT.first * *KindCost;
1451
1452 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1453 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1454 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1455 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1456 };
1457
1458 if (ST->is64Bit())
1459 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1460 if (auto KindCost = Entry->Cost[CostKind])
1461 return LT.first * *KindCost;
1462
1463 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1464 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1465 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1466 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1467
1468 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1469 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1470 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1471
1472 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1473 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1474 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1475
1476 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1477 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1478 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1479 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1480 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1481 };
1482
1483 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1484 if (auto KindCost = Entry->Cost[CostKind])
1485 return LT.first * *KindCost;
1486
1487 // It is not a good idea to vectorize division. We have to scalarize it and
1488 // in the process we will often end up having to spill regular
1489 // registers. The overhead of division is going to dominate most kernels
1490 // anyway, so try hard to prevent vectorization of division - it is
1491 // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1492 // to hide "20 cycles" for each lane.
1493 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1494 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1495 ISD == ISD::UREM)) {
1496 InstructionCost ScalarCost =
1497 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1498 Op1Info.getNoProps(), Op2Info.getNoProps());
1499 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1500 }
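// Worked example (illustrative only): a <4 x i32> sdiv costed for
// TCK_RecipThroughput with LT.first == 1 returns 20 * 1 * 4 * ScalarCost,
// i.e. eighty times the scalar division cost, which reliably steers the
// vectorizers away from vector division.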
1501
1502 // Handle some basic single-instruction code size cases.
1503 if (CostKind == TTI::TCK_CodeSize) {
1504 switch (ISD) {
1505 case ISD::FADD:
1506 case ISD::FSUB:
1507 case ISD::FMUL:
1508 case ISD::FDIV:
1509 case ISD::FNEG:
1510 case ISD::AND:
1511 case ISD::OR:
1512 case ISD::XOR:
1513 return LT.first;
1514 break;
1515 }
1516 }
1517
1518 // Fallback to the default implementation.
1519 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1520 Args, CxtI);
1521}
1522
1523 InstructionCost
1524 X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1525 unsigned Opcode1, const SmallBitVector &OpcodeMask,
1526 TTI::TargetCostKind CostKind) const {
1527 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1528 return TTI::TCC_Basic;
1529 return InstructionCost::getInvalid();
1530}
1531
1532 InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1533 VectorType *DstTy, VectorType *SrcTy,
1534 ArrayRef<int> Mask,
1535 TTI::TargetCostKind CostKind,
1536 int Index, VectorType *SubTp,
1537 ArrayRef<const Value *> Args,
1538 const Instruction *CxtI) const {
1539 assert((Mask.empty() || DstTy->isScalableTy() ||
1540 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1541 "Expected the Mask to match the return size if given");
1542 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1543 "Expected the same scalar types");
1544
1545 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1546 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1547 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1548
1549 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1550
1551 // If all args are constant then this will be constant folded away.
1552 if (!Args.empty() &&
1553 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1554 return TTI::TCC_Free;
1555
1556 // Recognize a basic concat_vector shuffle.
1557 if (Kind == TTI::SK_PermuteTwoSrc &&
1558 Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
1559 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1560 return getShuffleCost(TTI::SK_InsertSubvector,
1561 VectorType::getDoubleElementsVectorType(SrcTy),
1562 SrcTy, {},
1563 CostKind, Mask.size() / 2, SrcTy);
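// Illustrative example (not from the source): concatenating two <4 x i32>
// values via shufflevector with mask <0,1,2,3,4,5,6,7> is an identity mask of
// twice the source width, so it is priced here as inserting a <4 x i32>
// subvector at element 4 of the doubled <8 x i32> type.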
1564
1565 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1566 if (Kind == TTI::SK_Transpose)
1567 Kind = TTI::SK_PermuteTwoSrc;
1568
1569 if (Kind == TTI::SK_Broadcast) {
1570 // For Broadcasts we are splatting the first element from the first input
1571 // register, so only need to reference that input and all the output
1572 // registers are the same.
1573 LT.first = 1;
1574
1575 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1576 using namespace PatternMatch;
1577 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1578 (ST->hasAVX2() ||
1579 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1580 return TTI::TCC_Free;
1581 }
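// Illustrative note: this matches the hardware - AVX has load-folding splats
// such as vbroadcastss/vbroadcastsd for 32/64-bit elements, and AVX2 adds
// vpbroadcastb/vpbroadcastw, so a one-use broadcasted load costs nothing
// beyond the load itself.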
1582
1583 // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector
1584 // permutation.
1585 // Attempt to detect a shuffle mask with a single defined element.
1586 bool IsInLaneShuffle = false;
1587 bool IsSingleElementMask = false;
1588 if (SrcTy->getPrimitiveSizeInBits() > 0 &&
1589 (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
1590 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1591 Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
1592 unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
1593 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1594 if ((Mask.size() % NumLanes) == 0) {
1595 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1596 return P.value() == PoisonMaskElem ||
1597 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1598 (P.index() / NumEltsPerLane);
1599 });
1600 IsSingleElementMask =
1601 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1602 return M == PoisonMaskElem;
1603 }));
1604 }
1605 }
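// Illustrative example (not from the source): for v8f32 the 128-bit lanes
// cover elements 0-3 and 4-7, so mask <3,1,0,2,7,5,6,4> is an in-lane shuffle
// while <4,5,6,7,0,1,2,3> is not, as it moves elements across the 128-bit
// boundary. A mask such as <poison,1,poison,poison,poison,poison,poison,poison>
// has all but one element poison, so it also sets IsSingleElementMask.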
1606
1607 // Treat <X x bfloat> shuffles as <X x half>.
1608 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1609 LT.second = LT.second.changeVectorElementType(MVT::f16);
1610
1611 // Subvector extractions are free if they start at the beginning of a
1612 // vector and cheap if the subvectors are aligned.
1613 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1614 int NumElts = LT.second.getVectorNumElements();
1615 if ((Index % NumElts) == 0)
1616 return TTI::TCC_Free;
1617 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1618 if (SubLT.second.isVector()) {
1619 int NumSubElts = SubLT.second.getVectorNumElements();
1620 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1621 return SubLT.first;
1622 // Handle some cases for widening legalization. For now we only handle
1623 // cases where the original subvector was naturally aligned and evenly
1624 // fit in its legalized subvector type.
1625 // FIXME: Remove some of the alignment restrictions.
1626 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1627 // vectors.
1628 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1629 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1630 (NumSubElts % OrigSubElts) == 0 &&
1631 LT.second.getVectorElementType() ==
1632 SubLT.second.getVectorElementType() &&
1633 LT.second.getVectorElementType().getSizeInBits() ==
1634 SrcTy->getElementType()->getPrimitiveSizeInBits()) {
1635 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1636 "Unexpected number of elements!");
1637 auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
1638 LT.second.getVectorNumElements());
1639 auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
1640 SubLT.second.getVectorNumElements());
1641 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1642 InstructionCost ExtractCost =
1643 getShuffleCost(TTI::SK_ExtractSubvector, VecTy, VecTy, {}, CostKind,
1644 ExtractIndex, SubTy);
1645
1646 // If the original size is 32-bits or more, we can use pshufd. Otherwise
1647 // if we have SSSE3 we can use pshufb.
1648 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1649 return ExtractCost + 1; // pshufd or pshufb
1650
1651 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1652 "Unexpected vector size");
1653
1654 return ExtractCost + 2; // worst case pshufhw + pshufd
1655 }
1656 }
1657 // If the extract subvector is not optimal, treat it as single op shuffle.
1658 Kind = TTI::SK_PermuteSingleSrc;
1659 }
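// Illustrative example: extracting the high <2 x i64> half of a legal
// <4 x i64> (Index == 2) satisfies the aligned-subvector check and costs
// SubLT.first, while extracting from Index == 0 is a free subregister read.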
1660
1661 // Subvector insertions are cheap if the subvectors are aligned.
1662 // Note that in general, the insertion starting at the beginning of a vector
1663 // isn't free, because we need to preserve the rest of the wide vector,
1664 // but if the destination vector legalizes to the same width as the subvector
1665 // then the insertion will simplify to a (free) register copy.
1666 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1667 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
1668 int NumElts = DstLT.second.getVectorNumElements();
1669 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1670 if (SubLT.second.isVector()) {
1671 int NumSubElts = SubLT.second.getVectorNumElements();
1672 bool MatchingTypes =
1673 NumElts == NumSubElts &&
1674 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1675 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1676 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1677 }
1678
1679 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1680 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1681 // v1f32 (legalised to f32) into a v4f32.
1682 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1683 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1684 return 1;
1685
1686 // If the insertion is the lowest subvector then it will be blended
1687 // otherwise treat it like a 2-op shuffle.
1688 Kind =
1689 (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc;
1690 }
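// Illustrative example: inserting an f32 (a v1f32 legalized to f32) into
// element 0 of a v4f32 is a single MOVSS, and with SSE4.1 INSERTPS covers any
// element, which is the "return 1" case above.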
1691
1692 // Handle some common (illegal) sub-vector types as they are often very cheap
1693 // to shuffle even on targets without PSHUFB.
1694 EVT VT = TLI->getValueType(DL, SrcTy);
1695 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1696 !ST->hasSSSE3()) {
1697 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1698 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1699 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1700 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1701 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1702 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1703
1704 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1705 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1706 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1707 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1708
1709 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1710 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1711 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1712 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1713
1714 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1715 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1716 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1717 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1718 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1719
1720 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1721 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1722 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1723 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1724 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1725 };
1726
1727 if (ST->hasSSE2())
1728 if (const auto *Entry =
1729 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1730 if (auto KindCost = Entry->Cost[CostKind])
1731 return LT.first * *KindCost;
1732 }
1733
1734 // We are going to permute multiple sources and the result will be in multiple
1735 // destinations. We provide an accurate cost only for splits where the element
1736 // type remains the same.
1737 if (LT.first != 1) {
1738 MVT LegalVT = LT.second;
1739 if (LegalVT.isVector() &&
1740 LegalVT.getVectorElementType().getSizeInBits() ==
1742 LegalVT.getVectorNumElements() <
1743 cast<FixedVectorType>(SrcTy)->getNumElements()) {
1744 unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
1745 unsigned LegalVTSize = LegalVT.getStoreSize();
1746 // Number of source vectors after legalization:
1747 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1748 // Number of destination vectors after legalization:
1749 InstructionCost NumOfDests = LT.first;
1750
1751 auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
1752 LegalVT.getVectorNumElements());
1753
1754 if (!Mask.empty() && NumOfDests.isValid()) {
1755 // Try to perform better estimation of the permutation.
1756 // 1. Split the source/destination vectors into real registers.
1757 // 2. Do the mask analysis to identify which real registers are
1758 // permuted. If more than 1 source registers are used for the
1759 // destination register building, the cost for this destination register
1760 // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
1761 // source register is used, build mask and calculate the cost as a cost
1762 // of PermuteSingleSrc.
1763 // Also, for the single register permute we try to identify if the
1764 // destination register is just a copy of the source register or the
1765 // copy of the previous destination register (the cost is
1766 // TTI::TCC_Basic). If the source register is just reused, the cost for
1767 // this operation is TTI::TCC_Free.
1768 NumOfDests =
1769 getTypeLegalizationCost(
1770 FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
1771 .first;
1772 unsigned E = NumOfDests.getValue();
1773 unsigned NormalizedVF =
1774 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1775 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1776 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1777 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1778 copy(Mask, NormalizedMask.begin());
1779 unsigned PrevSrcReg = 0;
1780 ArrayRef<int> PrevRegMask;
1781 InstructionCost Cost = 0;
1782 processShuffleMasks(
1783 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1784 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1785 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1786 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1787 // Check if the previous register can be just copied to the next
1788 // one.
1789 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1790 PrevRegMask != RegMask)
1791 Cost +=
1792 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1793 SingleOpTy, RegMask, CostKind, 0, nullptr);
1794 else
1795 // Just a copy of previous destination register.
1796 Cost += TTI::TCC_Basic;
1797 return;
1798 }
1799 if (SrcReg != DestReg &&
1800 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1801 // Just a copy of the source register.
1802 Cost += TTI::TCC_Basic;
1803 }
1804 PrevSrcReg = SrcReg;
1805 PrevRegMask = RegMask;
1806 },
1807 [this, SingleOpTy, CostKind,
1808 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1809 unsigned /*Unused*/, bool /*Unused*/) {
1810 Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1811 SingleOpTy, RegMask, CostKind, 0, nullptr);
1812 });
1813 return Cost;
1814 }
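// Illustrative sketch (not from the source): a v16i32 single-source permute on
// an AVX2 target splits into two v8i32 registers; processShuffleMasks then
// walks the per-register sub-masks, charging nothing for identity chunks,
// TCC_Basic for plain register copies, and one single-source permute cost for
// each genuinely shuffled chunk.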
1815
1816 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1817 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1818 SingleOpTy, {}, CostKind, 0,
1819 nullptr);
1820 }
1821
1822 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1823 SubTp);
1824 }
1825
1826 // If we're just moving a single element around (probably as an alternative to
1827 // extracting it), we can assume this is cheap.
1828 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1829 return TTI::TCC_Basic;
1830
1831 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1832 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1833 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1834 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1835 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1836 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b
1837 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b
1838 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b
1839 };
1840
1841 if (ST->hasVBMI())
1842 if (const auto *Entry =
1843 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1844 if (auto KindCost = Entry->Cost[CostKind])
1845 return LT.first * *KindCost;
1846
1847 static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1848 { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1849 { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1850 { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1851
1852 { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
1853 { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
1854 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1855 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1856 { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
1857
1858 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1859 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1860 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1861 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1862 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
1863
1864 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
1865 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
1866 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
1867 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
1868 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1869
1870 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1871 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
1872
1873 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1874 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1875 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1876 };
1877
1878 if (ST->hasBWI())
1879 if (const auto *Entry =
1880 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1881 if (auto KindCost = Entry->Cost[CostKind])
1882 return LT.first * *KindCost;
1883
1884 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1885 {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1886 {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1887 {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
1888 {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
1889 {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1890 {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1891 {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1892 {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1893 {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1894 {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1895 {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1896 {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1897 {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1898 {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1899
1900 {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
1901 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
1902 {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
1903 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
1904 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1905 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1906 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1907
1908 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1909 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1910 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1911 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1912 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1913 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1914 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1915 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1916 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1917 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1918 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1919
1920 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1921 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1922 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1923 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1924 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1925 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1926 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1927 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1928 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1929 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1930 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1931 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1932 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1933
1934 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1935 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1936 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1937 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1938 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1939 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1940 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1941 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1942 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1943 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1944 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1945 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1946
1947 // FIXME: This just applies the type legalization cost rules above
1948 // assuming these completely split.
1949 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1950 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1951 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1952 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1953 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1954 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1955
1956 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1957 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1958 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1959 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1960 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1961 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1962 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1963 };
1964
1965 if (ST->hasAVX512())
1966 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1967 if (auto KindCost = Entry->Cost[CostKind])
1968 return LT.first * *KindCost;
1969
1970 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
1971 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
1972 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
1973 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
1974
1975 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
1976 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
1977 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1978 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1979 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1980 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1981 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1982 };
1983
1984 if (IsInLaneShuffle && ST->hasAVX2())
1985 if (const auto *Entry =
1986 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1987 if (auto KindCost = Entry->Cost[CostKind])
1988 return LT.first * *KindCost;
1989
1990 static const CostKindTblEntry AVX2ShuffleTbl[] = {
1991 { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
1992 { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
1993 { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
1994 { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
1995 { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
1996 { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1997 { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
1998 { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1999 { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
2000 { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
2001
2002 { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
2003 { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
2004 { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
2005 { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
2006 { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2007 { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2008 { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2009
2010 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
2011 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
2012 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
2013
2014 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2015 { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2016 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2017 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2018 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2019
2020 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
2021 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
2022 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
2023 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
2024 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
2025 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
2026 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
2027
2028 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
2029 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
2030 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
2031 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
2032 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
2033 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
2034 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
2035 };
2036
2037 if (ST->hasAVX2())
2038 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2039 if (auto KindCost = Entry->Cost[CostKind])
2040 return LT.first * *KindCost;
2041
2042 static const CostKindTblEntry XOPShuffleTbl[] = {
2043 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2044 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2045 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2046 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2047 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2048 // + vinsertf128
2049 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2050 // + vinsertf128
2051
2052 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2053 // + vinsertf128
2054
2055 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm
2056 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2057 // + vinsertf128
2058 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm
2059 };
2060
2061 if (ST->hasXOP())
2062 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2063 if (auto KindCost = Entry->Cost[CostKind])
2064 return LT.first * *KindCost;
2065
2066 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2067 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd
2068 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd
2069 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps
2070 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps
2071
2072 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2073 // + vpor + vinsertf128
2074 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2075 // + vpor + vinsertf128
2076 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2077 // + vpor + vinsertf128
2078
2079 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2080 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2081 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2082 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2083 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2084 // + 2*vpor + vinsertf128
2085 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2086 // + 2*vpor + vinsertf128
2087 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2088 // + 2*vpor + vinsertf128
2089 };
2090
2091 if (IsInLaneShuffle && ST->hasAVX())
2092 if (const auto *Entry =
2093 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2094 if (auto KindCost = Entry->Cost[CostKind])
2095 return LT.first * *KindCost;
2096
2097 static const CostKindTblEntry AVX1ShuffleTbl[] = {
2098 {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2099 {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
2100 {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2101 {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
2102 {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2103 {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2104 {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
2105
2106 {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2107 {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
2108 {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2109 {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
2110 {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2111 // + vinsertf128
2112 {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2113 // + vinsertf128
2114 {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
2115 // + vinsertf128
2116
2117 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
2118 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
2119 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
2120 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
2121 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2122 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2123 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
2124
2125 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
2126 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
2127 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2128 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2129 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2130 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2131 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2132
2133 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2134 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2135 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2136 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2137 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2138 // + 2*por + vinsertf128
2139 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2140 // + 2*por + vinsertf128
2141 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2142 // + 2*por + vinsertf128
2143
2144 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2145 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2146 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2147 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2148 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2149 // + 4*por + vinsertf128
2150 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2151 // + 4*por + vinsertf128
2152 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2153 // + 4*por + vinsertf128
2154 };
2155
2156 if (ST->hasAVX())
2157 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2158 if (auto KindCost = Entry->Cost[CostKind])
2159 return LT.first * *KindCost;
2160
2161 static const CostKindTblEntry SSE41ShuffleTbl[] = {
2162 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2163 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2164 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2165 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2166 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2167 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2168 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb
2169 };
2170
2171 if (ST->hasSSE41())
2172 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2173 if (auto KindCost = Entry->Cost[CostKind])
2174 return LT.first * *KindCost;
2175
2176 static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2177 {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
2178 {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
2179 {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
2180
2181 {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
2182 {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
2183 {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
2184
2185 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2186 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2187 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2188
2189 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2190 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2191 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2192 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2193 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2194
2195 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2196 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2197 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2198
2199 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2200 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2201 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2202 };
2203
2204 if (ST->hasSSSE3())
2205 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2206 if (auto KindCost = Entry->Cost[CostKind])
2207 return LT.first * *KindCost;
2208
2209 static const CostKindTblEntry SSE2ShuffleTbl[] = {
2210 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2211 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2212 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2213 {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
2214 {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
2215 {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
2216
2217 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2218 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2219 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2220 {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2221 {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2222 {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
2223 // + 2*pshufd + 2*unpck + packus
2224
2225 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2226 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2227 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2228 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2229 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2230 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2231
2232 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2233 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2234 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2235 {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2236 {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2237 {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2238
2239 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2240 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2241 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2242 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2243 // + pshufd/unpck
2244 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2245 // + pshufd/unpck
2246 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2247 // + 2*pshufd + 2*unpck + 2*packus
2248
2249 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2250 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2251 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2252 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
2253 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
2254 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2255 };
2256
2257 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2258 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2259 };
2260
2261 if (ST->hasSSE2()) {
2262 bool IsLoad =
2263 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2264 if (ST->hasSSE3() && IsLoad)
2265 if (const auto *Entry =
2266 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2267 assert(isLegalBroadcastLoad(SrcTy->getElementType(),
2268 LT.second.getVectorElementCount()) &&
2269 "Table entry missing from isLegalBroadcastLoad()");
2270 return LT.first * Entry->Cost;
2271 }
2272
2273 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2274 if (auto KindCost = Entry->Cost[CostKind])
2275 return LT.first * *KindCost;
2276 }
2277
2278 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2279 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2280 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2281 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2282 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2283 { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2284 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2285 };
2286
2287 if (ST->hasSSE1()) {
2288 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2289 // SHUFPS: both pairs must come from the same source register.
2290 auto MatchSHUFPS = [](int X, int Y) {
2291 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2292 };
2293 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2294 return 1;
2295 }
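// Illustrative example: the v4f32 mask <0,1,4,5> takes its low pair from one
// source and its high pair from the other, so (X & 4) agrees within each pair
// and a single SHUFPS suffices, whereas <0,4,1,5> mixes sources inside a pair
// and falls through to the SSE1ShuffleTbl lookup below.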
2296 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2297 if (auto KindCost = Entry->Cost[CostKind])
2298 return LT.first * *KindCost;
2299 }
2300
2301 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
2302 SubTp);
2303}
2304
2305 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2306 Type *Src,
2307 TTI::CastContextHint CCH,
2308 TTI::TargetCostKind CostKind,
2309 const Instruction *I) const {
2310 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2311 assert(ISD && "Invalid opcode");
2312
2313 // The cost tables include both specific, custom (non-legal) src/dst type
2314 // conversions and generic, legalized types. We test for the custom entries
2315 // first, before falling back to legalization.
2316 // FIXME: Need a better design of the cost table to handle non-simple types of
2317 // potential massive combinations (elem_num x src_type x dst_type).
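// Illustrative note: a "custom" row lets the table price an operation on a
// type the legalizer never produces - e.g. the AVX512BW rows below cost a
// v32i1 -> v32i16 mask sign-extension as a single instruction (vpmovm2w),
// which a purely legalized lookup could not express.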
2318 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2319 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2320 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2321
2322 // Mask sign extend has an instruction.
2323 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2324 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2325 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2326 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2327 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2328 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2329 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2330 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2331 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2332 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2333 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2334 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2335 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2336 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2337 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2338 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2339 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2340
2341 // Mask zero extend is a sext + shift.
2342 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2343 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2344 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2345 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2346 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2347 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2348 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2349 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2350 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2351 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2352 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2353 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2354 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2355 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2356 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2357 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2358 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2359
2360 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2361 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2362 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2363 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2364 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2365 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2366 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2367 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2368 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2369 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2370 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2371 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2372 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2373 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2374 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2375 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2376 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2377
2378 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2379 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2380 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2381 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2382 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2383 };
2384
2385 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2386 // Mask sign extend has an instruction.
2387 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2388 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2389 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2390 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2391 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2392 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2393 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2394 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2395
2396 // Mask zero extend is a sext + shift.
2397 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2398 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2399 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2400 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2401 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2402 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1 } },
2403 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } },
2404 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } },
2405
2406 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2407 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2408 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2409 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2410 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2411 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2412 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2413 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2414
2415 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2416 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2417
2418 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2419 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2420
2421 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2422 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2423
2424 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2425 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2426 };
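 // The single-op costs above follow from AVX512DQ adding direct i64 <-> fp
 // conversions at 512-bit width, e.g. vcvtqq2pd/vcvtuqq2pd and
 // vcvttpd2qq/vcvttpd2uqq, so no scalarization or multi-step lowering is
 // needed for these v8i64 cases.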
2427
2428 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2429 // 256-bit wide vectors.
2430
2431 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2432 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2433 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2434 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2435 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2436 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2437 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2438 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2439
2440 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2441 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2442 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2443 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2444 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2445 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2446 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2447 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2448 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2449 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2450 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2451 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2452 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2453 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2454 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2455 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2456 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2457 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2458 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2459 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2460 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2461 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2462 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2463 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2464 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2465 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2466 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2467 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2468 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2469 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2470 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2471 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2472 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2473 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } }, // 2*vpmovqd+concat+vpmovdb
2474
2475 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2476 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2477 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2478
2479 // Sign extend is zmm vpternlogd+vptruncdb.
2480 // Zero extend is zmm broadcast load+vptruncdb.
2481 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2482 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2483 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2484 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2485 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2486 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2487 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2488 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2489
2490 // Sign extend is zmm vpternlogd+vptruncdw.
2491 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2492 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2493 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2494 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2495 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2496 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2497 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2498 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2499 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2500
2501 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2502 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2503 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2504 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2505 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2506 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2507 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2508 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2509 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2510 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2511
2512 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2513 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2514 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2515 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2516
2517 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2518 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2519 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2520 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2521 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2522 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2523 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2524 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2525 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2526 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2527
2528 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2529 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2530
2531 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2532 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2533 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2534 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2535 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2536 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2537 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2538 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2539
2540 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2541 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2542 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2543 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2544 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2545 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2546 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2547 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2548 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2549 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2550
2551 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2552 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2553 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2554 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2555 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2556 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2557 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2558 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2559 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2560 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2561 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2562
2563 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2564 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2565 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2566 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2567 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2568 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2569 };
2570
2571 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2572 // Mask sign extend has an instruction.
2573 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2574 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2575 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2576 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2577 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2578 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2579 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2580 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2581 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2582 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2583 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2584 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2585 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2586 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2587 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2588 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2589 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2590
2591 // Mask zero extend is a sext + shift.
2592 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2593 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2594 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2595 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2596 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2597 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2598 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2599 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2600 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2601 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2602 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2603 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2604 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2605 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2606 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2607 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2608 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2609
2610 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2611 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2612 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2613 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2614 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2615 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2616 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2617 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2618 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2619 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2620 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2621 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2622 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2623 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2624 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2625 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2626 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2627
2628 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2629 };
2630
2631 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2632 // Mask sign extend has an instruction.
2633 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2634 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2635 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2636 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2637 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2638 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2639 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2640 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2641
2642 // Mask zero extend is a sext + shift.
2643 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2644 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2645 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2646 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2647 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2648 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2649 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2650 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2651
2652 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2653 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2654 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2655 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2656 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2657 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2658 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2659 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2660
2661 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2662 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2663 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2664 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2665
2666 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2667 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2668 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2669 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2670
2671 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2672 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2673 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2674 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2675
2676 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2677 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2678 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2679 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2680 };
2681
2682 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2683 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2684 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2685 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2686 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2687 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2688 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2689 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2690 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2691 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2692 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2693 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2694 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2695 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2696 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2697 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2698 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2699 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2700 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2701
2702 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2703 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2704 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2705 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2706 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2707 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2708 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2709 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2710 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2711 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2712
2713 // sign extend is vpcmpeq+maskedmove+vpmovdw
2714 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2715 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2716 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2717 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2718 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2719 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2720 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2721 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2722 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2723
2724 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2725 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2726 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2727 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2728 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2729 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2730 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2731 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2732
2733 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2734 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2735 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2736 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2737
2738 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2739 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2740 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2741 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2742 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2743 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2744 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2745 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2746 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2747 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2748 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2749 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2750
2751 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2752 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2753 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2754 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2755
2756 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2757 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2758 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2759 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2760 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2761 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2762 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2763 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2764 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2765 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2766 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2767 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2768 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2769
2770 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2771 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2772 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2773
2774 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2775 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2776 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2777 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2778 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2779 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2780 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2781 };
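 // The AVX512VL entries above largely mirror the 512-bit tables: VL makes
 // the same EVEX operations available at xmm/ymm width, so 128/256-bit
 // vectors can be converted in place instead of being widened to zmm first.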
2782
2783 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2784 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2785 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2786 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2787 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2788 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2789 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2790
2791 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2792 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2793 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2794 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2795 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2796 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2797 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2798 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2799 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2800 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2801 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2802 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2803 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2804 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2805
2806 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2807
2808 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2809 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2810 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2811 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2812 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2813 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2814 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2815 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2816 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2817 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2818 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2819 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2820
2821 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2822 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2823
2824 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2825 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2826 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2827 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2828
2829 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2830 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2831 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2832 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2833 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2834 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2835 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2836 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2837
2838 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2839 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2840 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2841 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2842 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2843 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2844 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2845
2846 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2847 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2848 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2849 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2850 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2851 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2852 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2853 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2854 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2855 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2856 };
2857
2858 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2859 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2860 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2861 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2862 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2863 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2864 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2865
2866 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2867 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2868 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2869 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2870 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2871 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2872 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2873 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2874 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2875 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2876 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2877 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2878
2879 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2880 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2881 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2882 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2883 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2884
2885 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2886 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2887 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2888 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2889 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2890 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2891 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2892 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2893
2894 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2895 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2896 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2897 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2898 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2899 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2900 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2901 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2902 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2903 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2904 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2905 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2906
2907 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2908 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2909 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2910 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2911 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2912 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2913 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2914 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2915 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2916 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2917 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2918 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2919 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2920 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2921 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2922 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2923 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2924
2925 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2926 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2927 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2928 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2929 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2930 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2931 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2932 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2933 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2934 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2935 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2936
2937 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2938 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2939 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2940 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2941 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2942 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2943 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2944 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2945 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2946 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2947 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2948 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2949 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2950
2951 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2952 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2953 };
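 // The 3-4 instruction counts in the AVX table above reflect that AVX1 has
 // no 256-bit integer ALU: a 256-bit integer extend or truncate is roughly
 // two 128-bit operations plus vextractf128/vinsertf128 to split and rejoin
 // the halves.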
2954
2955 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2956 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2957 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2958 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2959 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2960 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2961 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2962 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2963 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2964 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2965 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2966 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2967 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2968
2969 // These truncates end up widening elements.
2970 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
2971 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
2972 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
2973
2974 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2975 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2976 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2977
2978 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2979 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2980 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2981 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2982 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2983 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2984 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2985 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2986 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2987 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2988 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2989
2990 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2991 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2992 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2993 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2994 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2995 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2996 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2997 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2998 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
2999 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3000 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
3001 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
3002 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
3003 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
3004
3005 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3006 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
3007 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3008 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
3009 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3010 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3011 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3012 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3013 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
3014 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
3015
3016 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3017 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3018 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3019 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3020 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3021 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3022 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3023 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3024 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3025 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3026 };
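 // Most single-op extend entries above come from SSE4.1's pmovsx*/pmovzx*
 // family, e.g. the v16i8 -> v4i32 zero-extend row corresponds to a single
 //   pmovzxbd %xmm0, %xmm0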
3027
3028 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
3029 // These are somewhat magic numbers justified by comparing the
3030 // output of llvm-mca for our various supported scheduler models
3031 // and basing them on the worst-case scenario.
3032 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3033 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3034 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
3035 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
3036 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
3037 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3038 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
3039 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3040 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3041 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
3042 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
3043 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
3044
3045 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3046 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3047 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
3048 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
3049 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3050 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
3051 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
3052 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3053 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3054 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3055 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3056 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3057 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3058
3059 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3060 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3061 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3062 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3063 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3064 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3065 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3066 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3067 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3068 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3069
3070 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3071 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3072 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3073 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3074 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3075 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3076 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3077 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3078 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3079 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3080
3081 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3082 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3083 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3084 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3085 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3086 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3087 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3088 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3089 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3090 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3091 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3092 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3093
3094 // These truncates are really widening elements.
3095 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3096 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3097 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3098 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3099 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3100 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3101
3102 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3103 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3104 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3105 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3106 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3107 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3108 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3109 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3110 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3111 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3112 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3113 };
3114
3115 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3116 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3117 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3118 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3119 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3120 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3121 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3122 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3123 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3124 };
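 // F16C itself only converts half <-> single (vcvtph2ps/vcvtps2ph), so the
 // f16 <-> f64 entries above are modelled as two-instruction chains through
 // f32, matching the inline comments.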
3125
3126 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3127 EVT SrcTy = TLI->getValueType(DL, Src);
3128 EVT DstTy = TLI->getValueType(DL, Dst);
3129
3130 // The function getSimpleVT only handles simple value types.
3131 if (SrcTy.isSimple() && DstTy.isSimple()) {
3132 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3133 MVT SimpleDstTy = DstTy.getSimpleVT();
3134
3135 if (ST->useAVX512Regs()) {
3136 if (ST->hasBWI())
3137 if (const auto *Entry = ConvertCostTableLookup(
3138 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3139 if (auto KindCost = Entry->Cost[CostKind])
3140 return *KindCost;
3141
3142 if (ST->hasDQI())
3143 if (const auto *Entry = ConvertCostTableLookup(
3144 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3145 if (auto KindCost = Entry->Cost[CostKind])
3146 return *KindCost;
3147
3148 if (ST->hasAVX512())
3149 if (const auto *Entry = ConvertCostTableLookup(
3150 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3151 if (auto KindCost = Entry->Cost[CostKind])
3152 return *KindCost;
3153 }
3154
3155 if (ST->hasBWI())
3156 if (const auto *Entry = ConvertCostTableLookup(
3157 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3158 if (auto KindCost = Entry->Cost[CostKind])
3159 return *KindCost;
3160
3161 if (ST->hasDQI())
3162 if (const auto *Entry = ConvertCostTableLookup(
3163 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3164 if (auto KindCost = Entry->Cost[CostKind])
3165 return *KindCost;
3166
3167 if (ST->hasAVX512())
3168 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3169 SimpleDstTy, SimpleSrcTy))
3170 if (auto KindCost = Entry->Cost[CostKind])
3171 return *KindCost;
3172
3173 if (ST->hasAVX2()) {
3174 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3175 SimpleDstTy, SimpleSrcTy))
3176 if (auto KindCost = Entry->Cost[CostKind])
3177 return *KindCost;
3178 }
3179
3180 if (ST->hasAVX()) {
3181 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3182 SimpleDstTy, SimpleSrcTy))
3183 if (auto KindCost = Entry->Cost[CostKind])
3184 return *KindCost;
3185 }
3186
3187 if (ST->hasF16C()) {
3188 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3189 SimpleDstTy, SimpleSrcTy))
3190 if (auto KindCost = Entry->Cost[CostKind])
3191 return *KindCost;
3192 }
3193
3194 if (ST->hasSSE41()) {
3195 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3196 SimpleDstTy, SimpleSrcTy))
3197 if (auto KindCost = Entry->Cost[CostKind])
3198 return *KindCost;
3199 }
3200
3201 if (ST->hasSSE2()) {
3202 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3203 SimpleDstTy, SimpleSrcTy))
3204 if (auto KindCost = Entry->Cost[CostKind])
3205 return *KindCost;
3206 }
3207
3208 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3209 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3210 // fp16 conversions not covered by any table entries require a libcall.
3211 // Return a large (arbitrary) number to model this.
3212 return InstructionCost(64);
3213 }
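 // Without hardware f16 conversion support (e.g. F16C) these conversions
 // are typically lowered to compiler-rt/libgcc libcalls such as
 // __extendhfsf2 / __truncsfhf2, so any suitably large constant would do.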
3214 }
3215
3216 // Fall back to legalized types.
3217 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3218 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3219
3220 // If we're truncating to the same legalized type - just assume it's free.
3221 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3222 return TTI::TCC_Free;
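 // For example, a <16 x i8> -> <16 x i4> truncate would likely hit this:
 // v16i4 is promoted back to v16i8 during legalization, so source and
 // destination share a legalized MVT and the truncate costs nothing.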
3223
3224 if (ST->useAVX512Regs()) {
3225 if (ST->hasBWI())
3226 if (const auto *Entry = ConvertCostTableLookup(
3227 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3228 if (auto KindCost = Entry->Cost[CostKind])
3229 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3230
3231 if (ST->hasDQI())
3232 if (const auto *Entry = ConvertCostTableLookup(
3233 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3234 if (auto KindCost = Entry->Cost[CostKind])
3235 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3236
3237 if (ST->hasAVX512())
3238 if (const auto *Entry = ConvertCostTableLookup(
3239 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3240 if (auto KindCost = Entry->Cost[CostKind])
3241 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3242 }
3243
3244 if (ST->hasBWI())
3245 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3246 LTDest.second, LTSrc.second))
3247 if (auto KindCost = Entry->Cost[CostKind])
3248 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3249
3250 if (ST->hasDQI())
3251 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3252 LTDest.second, LTSrc.second))
3253 if (auto KindCost = Entry->Cost[CostKind])
3254 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3255
3256 if (ST->hasAVX512())
3257 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3258 LTDest.second, LTSrc.second))
3259 if (auto KindCost = Entry->Cost[CostKind])
3260 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3261
3262 if (ST->hasAVX2())
3263 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3264 LTDest.second, LTSrc.second))
3265 if (auto KindCost = Entry->Cost[CostKind])
3266 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3267
3268 if (ST->hasAVX())
3269 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3270 LTDest.second, LTSrc.second))
3271 if (auto KindCost = Entry->Cost[CostKind])
3272 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3273
3274 if (ST->hasF16C()) {
3275 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3276 LTDest.second, LTSrc.second))
3277 if (auto KindCost = Entry->Cost[CostKind])
3278 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3279 }
3280
3281 if (ST->hasSSE41())
3282 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3283 LTDest.second, LTSrc.second))
3284 if (auto KindCost = Entry->Cost[CostKind])
3285 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3286
3287 if (ST->hasSSE2())
3288 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3289 LTDest.second, LTSrc.second))
3290 if (auto KindCost = Entry->Cost[CostKind])
3291 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3292
3293 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend to i32
3294 // first and then convert with sitofp.
3295 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3296 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3297 Type *ExtSrc = Src->getWithNewBitWidth(32);
3298 unsigned ExtOpc =
3299 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3300
3301 // For scalar loads the extend would be free.
3302 InstructionCost ExtCost = 0;
3303 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3304 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3305
3306 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3307 TTI::CastContextHint::None, CostKind);
3308 }
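 // Note that widening to i32 is sound for the unsigned case as well: a
 // zero-extended i8/i16 value is non-negative as an i32, so the signed
 // i32 -> fp conversion produces the exact unsigned result.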
3309
3310 // Fallback: for fptosi/fptoui i8/i16 cases we convert via i32 fptosi
3311 // and then truncate the result.
3312 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3313 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3314 Type *TruncDst = Dst->getWithNewBitWidth(32);
3315 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3316 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3317 TTI::CastContextHint::None, CostKind);
3318 }
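 // The same trick works in this direction: any in-range i8/i16 result,
 // signed or unsigned, also fits in the signed i32 range, so fptosi to
 // i32 followed by a truncate covers the fptoui cases too.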
3319
3320 // TODO: Allow non-throughput costs that aren't binary.
3321 auto AdjustCost = [&CostKind](InstructionCost Cost,
3322 InstructionCost N = 1) -> InstructionCost {
3323 if (CostKind != TTI::TCK_RecipThroughput)
3324 return Cost == 0 ? 0 : N;
3325 return Cost * N;
3326 };
3327 return AdjustCost(
3328 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3329}
3330
3331 InstructionCost X86TTIImpl::getCmpSelInstrCost(
3332 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3333 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3334 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
3335 // Early out if this type isn't scalar/vector integer/float.
3336 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3337 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3338 Op1Info, Op2Info, I);
3339
3340 // Legalize the type.
3341 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3342
3343 MVT MTy = LT.second;
3344
3345 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3346 assert(ISD && "Invalid opcode");
3347
3348 InstructionCost ExtraCost = 0;
3349 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3350 // Some vector comparison predicates cost extra instructions.
3351 // TODO: Adjust ExtraCost based on CostKind?
3352 // TODO: Should we invert this and assume worst case cmp costs
3353 // and reduce for particular predicates?
3354 if (MTy.isVector() &&
3355 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3356 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3357 ST->hasBWI())) {
3358 // Fallback to I if a specific predicate wasn't specified.
3359 CmpInst::Predicate Pred = VecPred;
3360 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3361 Pred == CmpInst::BAD_FCMP_PREDICATE))
3362 Pred = cast<CmpInst>(I)->getPredicate();
3363
3364 bool CmpWithConstant = false;
3365 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3366 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3367
3368 switch (Pred) {
3369 case CmpInst::Predicate::ICMP_NE:
3370 // xor(cmpeq(x,y),-1)
3371 ExtraCost = CmpWithConstant ? 0 : 1;
3372 break;
3373 case CmpInst::Predicate::ICMP_SGE:
3374 case CmpInst::Predicate::ICMP_SLE:
3375 // xor(cmpgt(x,y),-1)
3376 ExtraCost = CmpWithConstant ? 0 : 1;
3377 break;
3378 case CmpInst::Predicate::ICMP_ULT:
3379 case CmpInst::Predicate::ICMP_UGT:
3380 // cmpgt(xor(x,signbit),xor(y,signbit))
3381 // xor(cmpeq(pmaxu(x,y),x),-1)
3382 ExtraCost = CmpWithConstant ? 1 : 2;
3383 break;
3384 case CmpInst::Predicate::ICMP_ULE:
3385 case CmpInst::Predicate::ICMP_UGE:
3386 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3387 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3388 // cmpeq(psubus(x,y),0)
3389 // cmpeq(pminu(x,y),x)
3390 ExtraCost = 1;
3391 } else {
3392 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3393 ExtraCost = CmpWithConstant ? 2 : 3;
3394 }
3395 break;
3396 case CmpInst::Predicate::FCMP_ONE:
3397 case CmpInst::Predicate::FCMP_UEQ:
3398 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3399 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3400 if (CondTy && !ST->hasAVX())
3401 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3402 CmpInst::Predicate::FCMP_UNO, CostKind,
3403 Op1Info, Op2Info) +
3404 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3405 CmpInst::Predicate::FCMP_OEQ, CostKind,
3406 Op1Info, Op2Info) +
3407 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3408
3409 break;
3410 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3411 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3412 // Assume worst case scenario and add the maximum extra cost.
3413 ExtraCost = 3;
3414 break;
3415 default:
3416 break;
3417 }
3418 }
3419 }
3420
3421 static const CostKindTblEntry SLMCostTbl[] = {
3422 // slm pcmpeq/pcmpgt throughput is 2
3423 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3424 // slm pblendvb/blendvpd/blendvps throughput is 4
3425 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3426 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3427 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3428 { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb
3429 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3430 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3431 };
3432
3433 static const CostKindTblEntry AVX512BWCostTbl[] = {
3434 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3435 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3436 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3437 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3438
3439 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3440 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3441 };
3442
3443 static const CostKindTblEntry AVX512CostTbl[] = {
3444 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3445 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3446 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3447 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3448
3449 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3450 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3451 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3452 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3453 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3454 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3455 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3456
3457 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3458 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3459 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3460 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3461 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3462 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3463 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3464 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3465 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3466 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3467 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3468 { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } },
3469 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3470 { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } },
3471
3472 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3473 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3474 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3475 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3476 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3477 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3478 };
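  // The pricier v32i16/v64i8 rows above reflect AVX512F without BWI, where
  // 512-bit i16/i8 compares and selects must be split into 256-bit halves.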
3479
3480 static const CostKindTblEntry AVX2CostTbl[] = {
3481 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3482 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3483 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3484 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3485 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3486 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3487
3488 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3489 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3490 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3491 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3492
3493 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3494 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3495 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3496 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3497 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3498 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3499 };
3500
3501 static const CostKindTblEntry XOPCostTbl[] = {
3502 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3503 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3504 };
3505
3506 static const CostKindTblEntry AVX1CostTbl[] = {
3507 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3508 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3509 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3510 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3511 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3512 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3513
3514 // AVX1 does not support 8-wide integer compare.
3515 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3516 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3517 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3518 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3519
3520 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3521 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3522 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3523 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3524 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3525 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3526 };
3527
3528 static const CostKindTblEntry SSE42CostTbl[] = {
3529 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3530 };
3531
3532 static const CostKindTblEntry SSE41CostTbl[] = {
3533 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3534 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3535
3536 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3537 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3538 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3539 { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps
3540 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3541 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3542 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3543 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3544 };
3545
3546 static const CostKindTblEntry SSE2CostTbl[] = {
3547 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3548 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3549
3550 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3551 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3552 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3553 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3554
3555 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3556 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3557 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3558 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3559 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3560 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3561 };
3562
3563 static const CostKindTblEntry SSE1CostTbl[] = {
3564 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3565 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3566
3567 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3568 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3569 };
3570
3571 if (ST->useSLMArithCosts())
3572 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3573 if (auto KindCost = Entry->Cost[CostKind])
3574 return LT.first * (ExtraCost + *KindCost);
3575
3576 if (ST->hasBWI())
3577 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3578 if (auto KindCost = Entry->Cost[CostKind])
3579 return LT.first * (ExtraCost + *KindCost);
3580
3581 if (ST->hasAVX512())
3582 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3583 if (auto KindCost = Entry->Cost[CostKind])
3584 return LT.first * (ExtraCost + *KindCost);
3585
3586 if (ST->hasAVX2())
3587 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3588 if (auto KindCost = Entry->Cost[CostKind])
3589 return LT.first * (ExtraCost + *KindCost);
3590
3591 if (ST->hasXOP())
3592 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3593 if (auto KindCost = Entry->Cost[CostKind])
3594 return LT.first * (ExtraCost + *KindCost);
3595
3596 if (ST->hasAVX())
3597 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3598 if (auto KindCost = Entry->Cost[CostKind])
3599 return LT.first * (ExtraCost + *KindCost);
3600
3601 if (ST->hasSSE42())
3602 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3603 if (auto KindCost = Entry->Cost[CostKind])
3604 return LT.first * (ExtraCost + *KindCost);
3605
3606 if (ST->hasSSE41())
3607 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3608 if (auto KindCost = Entry->Cost[CostKind])
3609 return LT.first * (ExtraCost + *KindCost);
3610
3611 if (ST->hasSSE2())
3612 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3613 if (auto KindCost = Entry->Cost[CostKind])
3614 return LT.first * (ExtraCost + *KindCost);
3615
3616 if (ST->hasSSE1())
3617 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3618 if (auto KindCost = Entry->Cost[CostKind])
3619 return LT.first * (ExtraCost + *KindCost);
3620
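  // Worked example (a sketch, tracing the lookups above): an icmp ult
  // <4 x i32> queried with TCK_RecipThroughput on a plain SSE2 target takes
  // the ICMP_ULT path (ExtraCost = 2 for the two sign-bit xors, no constant
  // operand) and hits the SSE2 SETCC v4i32 entry with throughput 1, so the
  // reported cost is LT.first * (2 + 1) = 3.
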
3621 // Assume a 3cy latency for fp select ops.
3622 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3623 if (ValTy->getScalarType()->isFloatingPointTy())
3624 return 3;
3625
3626 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3627 Op1Info, Op2Info, I);
3628}
3629
3630 InstructionCost
3631 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3632                                   TTI::TargetCostKind CostKind) const {
3635 // Costs should match the codegen from:
3636 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3637 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3638 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3639 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3640 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3641
3642 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3643 // specialized in these tables yet.
3644 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3645 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3646 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3647 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3648 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3649 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3650 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3651 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3652 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3653 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3654 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3655 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3656 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3657 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3658 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3659 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3660 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3661 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3662 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3663 };
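  // VBMI2 provides native funnel-shift/rotate instructions (VPSHLDV,
  // VPSHRDV and friends), hence the uniform single-op costs above.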
3664 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3665 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3666 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3667 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3668 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3669 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3670 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3671 };
3672 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3673 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3674 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3675 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3676 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3677 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3678 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3679 };
3680 static const CostKindTblEntry AVX512CDCostTbl[] = {
3681 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3682 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3683 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3684 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3685 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3686 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3687 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3688 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3689 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3690 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3691 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3692 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3693
3694 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3695 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3696 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3697 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3698 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3699 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3700 };
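  // AVX512CD supplies native VPLZCNTD/VPLZCNTQ, so 32/64-bit CTLZ is a
  // single op; the CTTZ rows assume isolating the lowest set bit (x & -x)
  // first and then reusing VPLZCNT, which adds a few ops.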
3701 static const CostKindTblEntry AVX512BWCostTbl[] = {
3702 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3703 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3704 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3705 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3706 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3707 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3708 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3709 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3710 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3711 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3712 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3713 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3714 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3715 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3716 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3717 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3718 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3719 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3720 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3721 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3722 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3723 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3724 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3725 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3726 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3727 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3728 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3729 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3730 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3731 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3732 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3733 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3734 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3735 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3736 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3737 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3738 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3739 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3740 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3741 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3742 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3743 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3744 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3745 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3746 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3747 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3748 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3749 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3750 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3751 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3752 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3753 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3754 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3755 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3756 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3757 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3758 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3759 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3760 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3761 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3762 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3763 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3764 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3765 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3766 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3767 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3768 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3769 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3770 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3771 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3772 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3773 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3774 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3775 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3776 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3777 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3778 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3779 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3780 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3781 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3782 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3783 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3784 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3785 };
3786 static const CostKindTblEntry AVX512CostTbl[] = {
3787 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3788 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3789 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3790 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3791 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3792 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3793 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3794 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3795 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3796 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3797 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3798 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3799 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3800 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3801 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3802 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3803 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3804 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3805 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3806 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3807 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3808 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3809 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3810 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3811 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3812 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3813 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3814 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3815 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3816 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3817 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3818 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3819 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3820 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3821 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3822 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3823 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3824 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3825 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3826 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3827 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3828 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3829 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3830 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3831 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3832 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3833 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3834 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3835 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3836 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3837 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3838 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3839 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3840 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3841 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3842 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3843 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3844 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3845 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3846 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3847 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3848 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3849 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3850 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3851 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3852 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3853 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3854 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3855 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3856 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3857 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3858 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3859 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3860 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3861 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3862 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3863 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3864 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3865 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3866 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3867 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3868 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3869 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3870 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3871 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3872 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3873 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3874 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3875 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3876 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3877 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3878 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3879 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3880 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3881 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3882 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3883 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3884 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3885 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3886 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3887 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3888 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3889 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3890 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3891 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3892 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3893 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3894 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3895 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3896 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3897 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3898 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3899 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3900 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3901 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3902 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3903 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3904 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3905 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3906 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3907 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3908 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3909 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3910 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3911 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3912 };
3913 static const CostKindTblEntry XOPCostTbl[] = {
3914 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3915 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3916 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3917 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3918 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3919 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3920 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3921 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3922 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3923 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3924 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3925 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3926 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3927 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3928 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3929 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3930 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3931 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3932 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3933 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3934 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3935 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3936 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3937 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3938 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3939 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3940 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3941 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3942 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3943 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3944 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3945 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3946 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3947 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3948 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3949 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3950 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3951 };
3952 static const CostKindTblEntry AVX2CostTbl[] = {
3953 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3954 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3955 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3956 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3957 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3958 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3959 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3960 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3961 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3962 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3963 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3964 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3965 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3966 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3967 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3968 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3969 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3970 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3971 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3972 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3973 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3974 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3975 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3976 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3977 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3978 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3979 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3980 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3981 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3982 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3983 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3984 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3985 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3986 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3987 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3988 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3989 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3990 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3991 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3992 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3993 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3994 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3995 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3996 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3997 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
3998 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
3999 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
4000 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
4001 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
4002 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
4003 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4004 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4005 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
4006 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
4007 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4008 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4009 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4010 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
4011 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
4012 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4013 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4014 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4015 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4016 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
4017 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
4018 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
4019 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
4020 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4021 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
4022 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
4023 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
4024 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
4025 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
4026 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
4027 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4028 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4029 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
4030 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
4031 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
4032 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4033 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4034 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
4035 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
4036 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4037 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4038 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4039 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
4040 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4041 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4042 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4043 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4044 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4045 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4046 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4047 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4048 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4049 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4050 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4051 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4052 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4053 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4054 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4055 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4056 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4057 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4058 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4059 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4060 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4061 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4062 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4063 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4064 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4065 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4066 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4067 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4068 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4069 };
4070 static const CostKindTblEntry AVX1CostTbl[] = {
4071 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4072 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4073 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4074 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4075 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4076 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4077 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4078 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4079 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4080 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4081 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4082 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4083 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4084 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4085 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4086 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4087 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4088 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4089 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4090 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4091 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4092 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4093 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4094 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4095 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4096 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4097 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4098 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4099 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4100 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4101 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4102 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4103 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4104 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4105 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4106 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4107 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4108 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4109 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4110 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4111 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4112 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4113 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4114 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4115 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4116 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4117 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4118 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4119 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4120 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4121 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4122 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4123 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4124 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4125 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4126 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4127 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4128 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4129 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4130 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4131 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4132 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4133 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4134 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4135 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4136 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4137 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4138 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4139 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4140 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4141 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4142 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4143 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4144 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4145 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4146 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4147 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4148 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4149 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4150 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4151 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4152 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4153 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4154 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4155 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4156 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4157 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4158 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4159 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4160 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4161 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4162 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4163 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4164 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4165 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4166 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4168 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4169 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4170 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4171 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4172 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4173 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4174 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4175 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4176 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4177 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4178 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4179 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4180 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4181 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4182 };
4183 static const CostKindTblEntry GFNICostTbl[] = {
4184 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4185 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4186 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4187 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4188 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4189 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4190 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4191 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4192 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4193 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4194 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4195 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4196 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4197 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4198 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4199 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4200 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4201 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4202 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4203 };
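  // GF2P8AFFINEQB with a constant bit-reversal matrix reverses the bits of
  // every byte in one instruction; the wider-element BITREVERSE rows only
  // add a byte-order shuffle on top.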
4204 static const CostKindTblEntry GLMCostTbl[] = {
4205 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4206 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4207 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4208 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4209 };
4210 static const CostKindTblEntry SLMCostTbl[] = {
4211 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4212 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4213 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4214 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4215 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4216 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4217 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4218 };
4219 static const CostKindTblEntry SSE42CostTbl[] = {
4220 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4221 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4222 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4223 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4224 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4225 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4226 };
4227 static const CostKindTblEntry SSE41CostTbl[] = {
4228 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4229 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4230 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4231 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4232 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4233 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4234 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4235 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4236 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4237 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4238 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4239 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4240 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4241 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4242 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4243 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4244 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4245 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4246 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4247 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4248 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4249 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4250 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4251 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4252 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4253 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4254 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4255 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4256 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4257 };
4258 static const CostKindTblEntry SSSE3CostTbl[] = {
4259 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4260 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4261 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4262 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4263 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4264 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4265 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4266 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4267 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4268 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4269 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4270 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4271 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4272 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4273 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4274 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4275 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4276 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4277 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4278 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4279 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4280 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4281 };
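  // The SSSE3 rows assume the usual PSHUFB nibble look-up expansion for
  // CTPOP/BITREVERSE (and the CTLZ/CTTZ sequences built on top of it):
  // split each byte into low/high nibbles, look both up, then combine.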
4282 static const CostKindTblEntry SSE2CostTbl[] = {
4283 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4284 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4285 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4286 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4287 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4288 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4289 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4290 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4291 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4292 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4293 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4294 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4295 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4296 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4297 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4298 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4299 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4300 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4301 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4302 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4303 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4304 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4305 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4306 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4307 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4308 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4309 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4310 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4311 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4312 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4313 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4314 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4315 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4316 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4317 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4318 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4319 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4320 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4321 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4322 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4323 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4324 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4325 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4326 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4327 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4328 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4329 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4330 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4331 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4332 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4333 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4334 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4335 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4336 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4337 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4338 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4339 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4340 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4341 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4342 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4343 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4344 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4345 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4346 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4347 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4348 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4349 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4350 };
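  // Without PSHUFB, the SSE2 rows fall back to arithmetic bit-twiddling
  // (shift/mask/add chains), which is why CTPOP/BITREVERSE/CTLZ are much
  // more expensive than the SSSE3 rows above.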
4351 static const CostKindTblEntry SSE1CostTbl[] = {
4352 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4353 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4354 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4355 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4356 };
4357 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4358 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4359 };
4360 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4361 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4362 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4363 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4364 };
4365 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4366 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4367 };
4368 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4369 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4370 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4371 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4372 };
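  // LZCNT/TZCNT are fully defined for zero inputs, so no zero check is
  // needed and the cost is a single op, unlike the BSR/BSF sequences in
  // the generic X64/X86 tables below.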
4373 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4374 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4375 };
4376 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4377 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4378 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4379 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4380 };
4381 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4382 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4383 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4384 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4385 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4386 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4387 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
4388 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4389 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4390 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4391 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4392 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4393 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4394 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4395 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4396 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4397 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4398 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4399 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4400 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4401 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4402 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4403 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4404 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4405 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4406 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4407 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4408 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4409 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4410 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4411 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4412 };
4413 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4414 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4415 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4416 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4417 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4418 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4419 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4420 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4421 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4422 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4423 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4424 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4425 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4426 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4427 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4428 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4429 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4430 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4431 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4432 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4433 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4434 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4435 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4436 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4437 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4438 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4439 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4440 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4441 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4442 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4443 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4444 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4445 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4446 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4447 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4448 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4449 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4450 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4451 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4452 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4453 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4454 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4455 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4456 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4457 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4458 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4459 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4460 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4461 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4462 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4463 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4464 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4465 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4466 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4467 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4468 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4469 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4470 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4471 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4472 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4473 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4474 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4475 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4476 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4477 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4478 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4479 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4480 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4481 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4482 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4483 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4484 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4485 };
4486
4487 Type *RetTy = ICA.getReturnType();
4488 Type *OpTy = RetTy;
4489 Intrinsic::ID IID = ICA.getID();
4490 unsigned ISD = ISD::DELETED_NODE;
4491 switch (IID) {
4492 default:
4493 break;
4494 case Intrinsic::abs:
4495 ISD = ISD::ABS;
4496 break;
4497 case Intrinsic::bitreverse:
4498 ISD = ISD::BITREVERSE;
4499 break;
4500 case Intrinsic::bswap:
4501 ISD = ISD::BSWAP;
4502 break;
4503 case Intrinsic::ctlz:
4504 ISD = ISD::CTLZ;
4505 break;
4506 case Intrinsic::ctpop:
4507 ISD = ISD::CTPOP;
4508 break;
4509 case Intrinsic::cttz:
4510 ISD = ISD::CTTZ;
4511 break;
4512 case Intrinsic::fshl:
4513 ISD = ISD::FSHL;
4514 if (!ICA.isTypeBasedOnly()) {
4515 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4516 if (Args[0] == Args[1]) {
4517 ISD = ISD::ROTL;
4518 // Handle uniform constant rotation amounts.
4519 // TODO: Handle funnel-shift cases.
4520 const APInt *Amt;
4521 if (Args[2] &&
4522 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4523 ISD = X86ISD::VROTLI;
4524 }
4525 }
4526 break;
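    // Note: fshl(x, x, z) is a rotate-left, which is why matching value
    // operands switch the lookup to the cheaper ROTL/VROTLI rows above.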
4527 case Intrinsic::fshr:
4528 // FSHR has same costs so don't duplicate.
4529 ISD = ISD::FSHL;
4530 if (!ICA.isTypeBasedOnly()) {
4531 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4532 if (Args[0] == Args[1]) {
4533 ISD = ISD::ROTR;
4534 // Handle uniform constant rotation amount.
4535 // TODO: Handle funnel-shift cases.
4536 const APInt *Amt;
4537 if (Args[2] &&
4538 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4539 ISD = X86ISD::VROTLI;
4540 }
4541 }
4542 break;
4543 case Intrinsic::lrint:
4544 case Intrinsic::llrint: {
4545 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4546 // have the same costs as the CVTTP2SI (fptosi) instructions
4547 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4548 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4549 TTI::CastContextHint::None, CostKind);
4550 }
4551 case Intrinsic::maxnum:
4552 case Intrinsic::minnum:
4553 // FMINNUM has same costs so don't duplicate.
4554 ISD = ISD::FMAXNUM;
4555 break;
4556 case Intrinsic::sadd_sat:
4557 ISD = ISD::SADDSAT;
4558 break;
4559 case Intrinsic::smax:
4560 ISD = ISD::SMAX;
4561 break;
4562 case Intrinsic::smin:
4563 ISD = ISD::SMIN;
4564 break;
4565 case Intrinsic::ssub_sat:
4566 ISD = ISD::SSUBSAT;
4567 break;
4568 case Intrinsic::uadd_sat:
4569 ISD = ISD::UADDSAT;
4570 break;
4571 case Intrinsic::umax:
4572 ISD = ISD::UMAX;
4573 break;
4574 case Intrinsic::umin:
4575 ISD = ISD::UMIN;
4576 break;
4577 case Intrinsic::usub_sat:
4578 ISD = ISD::USUBSAT;
4579 break;
4580 case Intrinsic::sqrt:
4581 ISD = ISD::FSQRT;
4582 break;
4583 case Intrinsic::sadd_with_overflow:
4584 case Intrinsic::ssub_with_overflow:
4585 // SSUBO has same costs so don't duplicate.
4586 ISD = ISD::SADDO;
4587 OpTy = RetTy->getContainedType(0);
4588 break;
4589 case Intrinsic::uadd_with_overflow:
4590 case Intrinsic::usub_with_overflow:
4591 // USUBO has same costs so don't duplicate.
4592 ISD = ISD::UADDO;
4593 OpTy = RetTy->getContainedType(0);
4594 break;
4595 case Intrinsic::smul_with_overflow:
4596 ISD = ISD::SMULO;
4597 OpTy = RetTy->getContainedType(0);
4598 break;
4599 case Intrinsic::umul_with_overflow:
4600 ISD = ISD::UMULO;
4601 OpTy = RetTy->getContainedType(0);
4602 break;
4603 }
4604
4605 if (ISD != ISD::DELETED_NODE) {
4606 auto adjustTableCost = [&](int ISD, unsigned Cost,
4607 std::pair<InstructionCost, MVT> LT,
4608 FastMathFlags FMF) {
4609 InstructionCost LegalizationCost = LT.first;
4610 MVT MTy = LT.second;
4611
4612 // If there are no NANs to deal with, then these are reduced to a
4613 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4614 // assume is used in the non-fast case.
4615 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4616 if (FMF.noNaNs())
4617 return LegalizationCost * 1;
4618 }
4619
4620 // For cases where some ops can be folded into a load/store, assume free.
4621 if (MTy.isScalarInteger()) {
4622 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4623 if (const Instruction *II = ICA.getInst()) {
4624 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4625 return TTI::TCC_Free;
4626 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4627 if (LI->hasOneUse())
4628 return TTI::TCC_Free;
4629 }
4630 }
4631 }
4632 }
4633
4634 return LegalizationCost * (int)Cost;
4635 };
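    // e.g. on a fast-MOVBE target, a scalar bswap whose only use is a store
    // (or whose operand is a single-use load) is expected to fold into
    // MOVBE, so adjustTableCost above treats it as TCC_Free.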
4636
4637 // Legalize the type.
4638 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4639 MVT MTy = LT.second;
4640
4641 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4642 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4643 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4644 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4645 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4646 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4647 if (Cst->isAllOnesValue())
4648 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4649 }
4650
4651 // FSQRT is a single instruction.
4652 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4653 return LT.first;
4654
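// The cost tables below are consulted in roughly newest-to-oldest feature
// order, so the first hit should reflect the cheapest sequence the
// subtarget can actually use for this (opcode, type) pair.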
4655 if (ST->useGLMDivSqrtCosts())
4656 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4657 if (auto KindCost = Entry->Cost[CostKind])
4658 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4659
4660 if (ST->useSLMArithCosts())
4661 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4662 if (auto KindCost = Entry->Cost[CostKind])
4663 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4664
4665 if (ST->hasVBMI2())
4666 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4667 if (auto KindCost = Entry->Cost[CostKind])
4668 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4669
4670 if (ST->hasBITALG())
4671 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4672 if (auto KindCost = Entry->Cost[CostKind])
4673 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4674
4675 if (ST->hasVPOPCNTDQ())
4676 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4677 if (auto KindCost = Entry->Cost[CostKind])
4678 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4679
4680 if (ST->hasGFNI())
4681 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4682 if (auto KindCost = Entry->Cost[CostKind])
4683 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4684
4685 if (ST->hasCDI())
4686 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4687 if (auto KindCost = Entry->Cost[CostKind])
4688 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4689
4690 if (ST->hasBWI())
4691 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4692 if (auto KindCost = Entry->Cost[CostKind])
4693 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4694
4695 if (ST->hasAVX512())
4696 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4697 if (auto KindCost = Entry->Cost[CostKind])
4698 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4699
4700 if (ST->hasXOP())
4701 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4702 if (auto KindCost = Entry->Cost[CostKind])
4703 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4704
4705 if (ST->hasAVX2())
4706 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4707 if (auto KindCost = Entry->Cost[CostKind])
4708 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4709
4710 if (ST->hasAVX())
4711 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4712 if (auto KindCost = Entry->Cost[CostKind])
4713 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4714
4715 if (ST->hasSSE42())
4716 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4717 if (auto KindCost = Entry->Cost[CostKind])
4718 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4719
4720 if (ST->hasSSE41())
4721 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4722 if (auto KindCost = Entry->Cost[CostKind])
4723 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4724
4725 if (ST->hasSSSE3())
4726 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4727 if (auto KindCost = Entry->Cost[CostKind])
4728 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4729
4730 if (ST->hasSSE2())
4731 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4732 if (auto KindCost = Entry->Cost[CostKind])
4733 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4734
4735 if (ST->hasSSE1())
4736 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4737 if (auto KindCost = Entry->Cost[CostKind])
4738 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4739
4740 if (ST->hasBMI()) {
4741 if (ST->is64Bit())
4742 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4743 if (auto KindCost = Entry->Cost[CostKind])
4744 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4745
4746 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4747 if (auto KindCost = Entry->Cost[CostKind])
4748 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4749 }
4750
4751 if (ST->hasLZCNT()) {
4752 if (ST->is64Bit())
4753 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4754 if (auto KindCost = Entry->Cost[CostKind])
4755 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4756
4757 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4758 if (auto KindCost = Entry->Cost[CostKind])
4759 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4760 }
4761
4762 if (ST->hasPOPCNT()) {
4763 if (ST->is64Bit())
4764 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4765 if (auto KindCost = Entry->Cost[CostKind])
4766 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4767
4768 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4769 if (auto KindCost = Entry->Cost[CostKind])
4770 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4771 }
4772
4773 if (ST->is64Bit())
4774 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4775 if (auto KindCost = Entry->Cost[CostKind])
4776 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4777
4778 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4779 if (auto KindCost = Entry->Cost[CostKind])
4780 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4781
4782 // Without arg data, we need to compute the expanded costs of custom lowered
4783 // intrinsics to prevent use of the (very low) default costs.
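// A rough sketch of the expansion costed below (bw = scalar bit width):
//   z &= (bw - 1); res = (x << z) | (y >> (bw - z));
// plus an icmp+select to handle z == 0 (where y >> bw would be undefined),
// which accounts for the or/sub/shl/lshr/and/icmp/select costs summed here.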
4784 if (ICA.isTypeBasedOnly() &&
4785 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4786 Type *CondTy = RetTy->getWithNewBitWidth(1);
4787 InstructionCost Cost = 0;
4788 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4789 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4790 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4791 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4792 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4793 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4794 CmpInst::ICMP_EQ, CostKind);
4795 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4796 CmpInst::ICMP_EQ, CostKind);
4797 return Cost;
4798 }
4799 }
4800
4801 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4802}
4803
4804 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4805 TTI::TargetCostKind CostKind,
4806 unsigned Index, const Value *Op0,
4807 const Value *Op1) const {
4808 static const CostTblEntry SLMCostTbl[] = {
4809 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4810 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4811 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4812 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4813 };
4814
4815 assert(Val->isVectorTy() && "This must be a vector type");
4816 Type *ScalarType = Val->getScalarType();
4817 InstructionCost RegisterFileMoveCost = 0;
4818
4819 // Non-immediate extraction/insertion can be handled as a sequence of
4820 // aliased loads+stores via the stack.
4821 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4822 Opcode == Instruction::InsertElement)) {
4823 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4824 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4825
4826 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4827 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4828 Align VecAlign = DL.getPrefTypeAlign(Val);
4829 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4830
4831 // Extract - store vector to stack, load scalar.
4832 if (Opcode == Instruction::ExtractElement) {
4833 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4834 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4835 CostKind);
4836 }
4837 // Insert - store vector to stack, store scalar, load vector.
4838 if (Opcode == Instruction::InsertElement) {
4839 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4840 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4841 CostKind) +
4842 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4843 }
4844 }
4845
4846 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4847 Opcode == Instruction::InsertElement)) {
4848 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4849 if (Opcode == Instruction::ExtractElement &&
4850 ScalarType->getScalarSizeInBits() == 1 &&
4851 cast<FixedVectorType>(Val)->getNumElements() > 1)
4852 return 1;
4853
4854 // Legalize the type.
4855 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4856
4857 // This type is legalized to a scalar type.
4858 if (!LT.second.isVector())
4859 return TTI::TCC_Free;
4860
4861 // The type may be split. Normalize the index to the new type.
4862 unsigned SizeInBits = LT.second.getSizeInBits();
4863 unsigned NumElts = LT.second.getVectorNumElements();
4864 unsigned SubNumElts = NumElts;
4865 Index = Index % NumElts;
4866
4867 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4868 // For inserts, we also need to insert the subvector back.
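// E.g. with a 256-bit legal type holding 4 elements per 128-bit lane,
// accessing element 5 costs an extra subvector extract (and a re-insert
// for insertions), after which it is treated as in-lane index 1.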
4869 if (SizeInBits > 128) {
4870 assert((SizeInBits % 128) == 0 && "Illegal vector");
4871 unsigned NumSubVecs = SizeInBits / 128;
4872 SubNumElts = NumElts / NumSubVecs;
4873 if (SubNumElts <= Index) {
4874 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4875 Index %= SubNumElts;
4876 }
4877 }
4878
4879 MVT MScalarTy = LT.second.getScalarType();
4880 auto IsCheapPInsrPExtrInsertPS = [&]() {
4881 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4882 // Inserting f32 into index0 is just movss.
4883 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4884 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4885 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4886 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4887 Opcode == Instruction::InsertElement) ||
4888 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4889 Opcode == Instruction::InsertElement);
4890 };
4891
4892 if (Index == 0) {
4893 // Floating point scalars are already located in index #0.
4894 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4895 // true for all.
4896 if (ScalarType->isFloatingPointTy() &&
4897 (Opcode != Instruction::InsertElement || !Op0 ||
4898 isa<UndefValue>(Op0)))
4899 return RegisterFileMoveCost;
4900
4901 if (Opcode == Instruction::InsertElement &&
4902 isa_and_nonnull<UndefValue>(Op0)) {
4903 // Consider the gather cost to be cheap.
4904 if (isa_and_nonnull<LoadInst>(Op1))
4905 return RegisterFileMoveCost;
4906 if (!IsCheapPInsrPExtrInsertPS()) {
4907 // mov constant-to-GPR + movd/movq GPR -> XMM.
4908 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4909 return 2 + RegisterFileMoveCost;
4910 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4911 return 1 + RegisterFileMoveCost;
4912 }
4913 }
4914
4915 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4916 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4917 return 1 + RegisterFileMoveCost;
4918 }
4919
4920 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4921 assert(ISD && "Unexpected vector opcode");
4922 if (ST->useSLMArithCosts())
4923 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4924 return Entry->Cost + RegisterFileMoveCost;
4925
4926 // Consider cheap cases.
4927 if (IsCheapPInsrPExtrInsertPS())
4928 return 1 + RegisterFileMoveCost;
4929
4930 // For extractions we just need to shuffle the element to index 0, which
4931 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4932 // the elements to their destination. In both cases we must handle the
4933 // subvector move(s).
4934 // If the vector type is already less than 128-bits then don't reduce it.
4935 // TODO: Under what circumstances should we shuffle using the full width?
4936 InstructionCost ShuffleCost = 1;
4937 if (Opcode == Instruction::InsertElement) {
4938 auto *SubTy = cast<VectorType>(Val);
4939 EVT VT = TLI->getValueType(DL, Val);
4940 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4941 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4942 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
4943 CostKind, 0, SubTy);
4944 }
4945 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4946 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4947 }
4948
4949 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4950 RegisterFileMoveCost;
4951}
4952
4953 InstructionCost X86TTIImpl::getScalarizationOverhead(
4954 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4955 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4956 ArrayRef<Value *> VL) const {
4957 assert(DemandedElts.getBitWidth() ==
4958 cast<FixedVectorType>(Ty)->getNumElements() &&
4959 "Vector size mismatch");
4960
4961 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4962 MVT MScalarTy = LT.second.getScalarType();
4963 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4964 InstructionCost Cost = 0;
4965
4966 constexpr unsigned LaneBitWidth = 128;
4967 assert((LegalVectorBitWidth < LaneBitWidth ||
4968 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4969 "Illegal vector");
4970
4971 const int NumLegalVectors = LT.first.getValue();
4972 assert(NumLegalVectors >= 0 && "Negative cost!");
4973
4974 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4975 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has
4976 // a special heuristic regarding poison input which is passed here in
4977 // ForPoisonSrc.
4978 if (Insert && !ForPoisonSrc) {
4979 // This is nearly identical to BaseT::getScalarizationOverhead(), except
4980 // it is passing nullptr to getVectorInstrCost() for Op0 (instead of
4981 // Constant::getNullValue()), which makes the X86TTIImpl
4982 // getVectorInstrCost() return 0 instead of 1.
4983 for (unsigned I : seq(DemandedElts.getBitWidth())) {
4984 if (!DemandedElts[I])
4985 continue;
4986 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, I,
4987 Constant::getNullValue(Ty),
4988 VL.empty() ? nullptr : VL[I]);
4989 }
4990 return Cost;
4991 }
4992
4993 if (Insert) {
4994 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4995 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4996 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4997 // For types we can insert directly, insertion into 128-bit sub vectors is
4998 // cheap, followed by a cheap chain of concatenations.
4999 if (LegalVectorBitWidth <= LaneBitWidth) {
5000 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
5001 /*Extract*/ false, CostKind);
5002 } else {
5003 // In each 128-lane, if at least one index is demanded but not all
5004 // indices are demanded and this 128-lane is not the first 128-lane of
5005 // the legalized vector, then this 128-lane needs an extracti128; if in
5006 // each 128-lane, there is at least one demanded index, this 128-lane
5007 // needs an inserti128.
5008
5009 // The following cases will help you build a better understanding:
5010 // Assume we insert several elements into a v8i32 vector with AVX2:
5011 // Case#1: inserting at index 1 needs vpinsrd + inserti128.
5012 // Case#2: inserting at index 5 needs extracti128 + vpinsrd +
5013 // inserti128.
5014 // Case#3: inserting at indices 4,5,6,7 needs 4*vpinsrd + inserti128.
5015 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
5016 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5017 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5018 unsigned NumLegalElts =
5019 LT.second.getVectorNumElements() * NumLegalVectors;
5020 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5021 "Vector has been legalized to smaller element count");
5022 assert((NumLegalElts % NumLanesTotal) == 0 &&
5023 "Unexpected elts per lane");
5024 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5025
5026 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5027 auto *LaneTy =
5028 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5029
5030 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5031 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5032 NumEltsPerLane, NumEltsPerLane * I);
5033 if (LaneEltMask.isZero())
5034 continue;
5035 // FIXME: we don't need to extract if all non-demanded elements
5036 // are legalization-inserted padding.
5037 if (!LaneEltMask.isAllOnes())
5038 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5039 CostKind, I * NumEltsPerLane, LaneTy);
5040 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
5041 /*Extract*/ false, CostKind);
5042 }
5043
5044 APInt AffectedLanes =
5045 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
5046 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
5047 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
5048 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
5049 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
5050 unsigned I = NumLegalLanes * LegalVec + Lane;
5051 // No need to insert unaffected lane; or lane 0 of each legal vector
5052 // iff ALL lanes of that vector were affected and will be inserted.
5053 if (!AffectedLanes[I] ||
5054 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5055 continue;
5056 Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, Ty, {},
5057 CostKind, I * NumEltsPerLane, LaneTy);
5058 }
5059 }
5060 }
5061 } else if (LT.second.isVector()) {
5062 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5063 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5064 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5065 // considered cheap.
5066 if (Ty->isIntOrIntVectorTy())
5067 Cost += DemandedElts.popcount();
5068
5069 // Get the smaller of the legalized or original pow2-extended number of
5070 // vector elements, which represents the number of unpacks we'll end up
5071 // performing.
5072 unsigned NumElts = LT.second.getVectorNumElements();
5073 unsigned Pow2Elts =
5074 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
5075 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5076 }
5077 }
5078
5079 if (Extract) {
5080 // vXi1 can be efficiently extracted with MOVMSK.
5081 // TODO: AVX512 predicate mask handling.
5082 // NOTE: This doesn't work well for roundtrip scalarization.
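// E.g. extracting all elements of a v32i1 should cost a single vpmovmskb
// on AVX2, but two pmovmskb ops when only SSE2 is available.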
5083 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5084 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
5085 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5086 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5087 return MOVMSKCost;
5088 }
5089
5090 if (LT.second.isVector()) {
5091 unsigned NumLegalElts =
5092 LT.second.getVectorNumElements() * NumLegalVectors;
5093 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5094 "Vector has been legalized to smaller element count");
5095
5096 // If we're extracting elements from a 128-bit subvector lane,
5097 // we only need to extract each lane once, not for every element.
5098 if (LegalVectorBitWidth > LaneBitWidth) {
5099 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5100 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5101 assert((NumLegalElts % NumLanesTotal) == 0 &&
5102 "Unexpected elts per lane");
5103 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5104
5105 // Add cost for each demanded 128-bit subvector extraction.
5106 // Luckily this is a lot easier than for insertion.
5107 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5108 auto *LaneTy =
5109 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5110
5111 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5112 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5113 NumEltsPerLane, I * NumEltsPerLane);
5114 if (LaneEltMask.isZero())
5115 continue;
5116 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {}, CostKind,
5117 I * NumEltsPerLane, LaneTy);
5118 Cost += BaseT::getScalarizationOverhead(
5119 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5120 }
5121
5122 return Cost;
5123 }
5124 }
5125
5126 // Fallback to default extraction.
5127 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5128 Extract, CostKind);
5129 }
5130
5131 return Cost;
5132}
5133
5134 InstructionCost
5135X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5136 int VF, const APInt &DemandedDstElts,
5137 TTI::TargetCostKind CostKind) const {
5138 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5139 // We don't differentiate element types here, only element bit width.
5140 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5141
5142 auto bailout = [&]() {
5143 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5144 DemandedDstElts, CostKind);
5145 };
5146
5147 // For now, only deal with AVX512 cases.
5148 if (!ST->hasAVX512())
5149 return bailout();
5150
5151 // Do we have a native shuffle for this element type, or should we promote?
5152 unsigned PromEltTyBits = EltTyBits;
5153 switch (EltTyBits) {
5154 case 32:
5155 case 64:
5156 break; // AVX512F.
5157 case 16:
5158 if (!ST->hasBWI())
5159 PromEltTyBits = 32; // promote to i32, AVX512F.
5160 break; // AVX512BW
5161 case 8:
5162 if (!ST->hasVBMI())
5163 PromEltTyBits = 32; // promote to i32, AVX512F.
5164 break; // AVX512VBMI
5165 case 1:
5166 // There is no support for shuffling i1 elements. We *must* promote.
5167 if (ST->hasBWI()) {
5168 if (ST->hasVBMI())
5169 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5170 else
5171 PromEltTyBits = 16; // promote to i16, AVX512BW.
5172 break;
5173 }
5174 PromEltTyBits = 32; // promote to i32, AVX512F.
5175 break;
5176 default:
5177 return bailout();
5178 }
5179 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5180
5181 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5182 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5183
5184 int NumDstElements = VF * ReplicationFactor;
5185 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5186 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5187
5188 // Legalize the types.
5189 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5190 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5191 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5192 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5193 // They should have legalized into vector types.
5194 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5195 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5196 return bailout();
5197
5198 if (PromEltTyBits != EltTyBits) {
5199 // If we have to perform the shuffle with a wider elt type than our data type,
5200 // then we will first need to anyext (we don't care about the new bits)
5201 // the source elements, and then truncate Dst elements.
5202 InstructionCost PromotionCost;
5203 PromotionCost += getCastInstrCost(
5204 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5205 TTI::CastContextHint::None, CostKind);
5206 PromotionCost +=
5207 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5208 /*Src=*/PromDstVecTy,
5209 TTI::CastContextHint::None, CostKind);
5210 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5211 ReplicationFactor, VF,
5212 DemandedDstElts, CostKind);
5213 }
5214
5215 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5216 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5217 "We expect that the legalization doesn't affect the element width, "
5218 "doesn't coalesce/split elements.");
5219
5220 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5221 unsigned NumDstVectors =
5222 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5223
5224 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5225
5226 // Not all the produced Dst elements may be demanded. In our case,
5227 // given that a single Dst vector is formed by a single shuffle,
5228 // if none of the elements that would form a single Dst vector are demanded,
5229 // then we won't need to do that shuffle, so adjust the cost accordingly.
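// E.g. with a replication factor of 2 and VF = 8, if each legal Dst vector
// holds 8 elements and only the first 8 of the 16 produced elements are
// demanded, only one of the two per-vector shuffles needs to be paid for.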
5230 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5231 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5232 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5233
5234 InstructionCost SingleShuffleCost =
5235 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
5236 /*Mask=*/{}, CostKind,
5237 /*Index=*/0, /*SubTp=*/nullptr);
5238 return NumDstVectorsDemanded * SingleShuffleCost;
5239}
5240
5241 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5242 Align Alignment,
5243 unsigned AddressSpace,
5244 TTI::TargetCostKind CostKind,
5245 TTI::OperandValueInfo OpInfo,
5246 const Instruction *I) const {
5247 // TODO: Handle other cost kinds.
5248 if (CostKind != TTI::TCK_RecipThroughput) {
5249 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5250 // Store instruction with index and scale costs 2 Uops.
5251 // Check the preceding GEP to identify non-const indices.
5252 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5253 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5254 return TTI::TCC_Basic * 2;
5255 }
5256 }
5257 return TTI::TCC_Basic;
5258 }
5259
5260 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5261 "Invalid Opcode");
5262 // Type legalization can't handle structs
5263 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5264 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5265 CostKind, OpInfo, I);
5266
5267 // Legalize the type.
5268 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5269
5270 auto *VTy = dyn_cast<FixedVectorType>(Src);
5271
5272 InstructionCost Cost = 0;
5273
5274 // Add a cost for constant load to vector.
5275 if (Opcode == Instruction::Store && OpInfo.isConstant())
5276 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5277 /*AddressSpace=*/0, CostKind, OpInfo);
5278
5279 // Handle the simple case of non-vectors.
5280 // NOTE: this assumes that legalization never creates a vector from scalars!
5281 if (!VTy || !LT.second.isVector()) {
5282 // Each load/store unit costs 1.
5283 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5284 }
5285
5286 bool IsLoad = Opcode == Instruction::Load;
5287
5288 Type *EltTy = VTy->getElementType();
5289
5290 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5291
5292 // Source of truth: how many elements were there in the original IR vector?
5293 const unsigned SrcNumElt = VTy->getNumElements();
5294
5295 // How far have we gotten?
5296 int NumEltRemaining = SrcNumElt;
5297 // Note that we intentionally capture by reference, since NumEltRemaining changes.
5298 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5299
5300 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5301
5302 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5303 const unsigned XMMBits = 128;
5304 if (XMMBits % EltTyBits != 0)
5305 // Vector size must be a multiple of the element size. I.e. no padding.
5306 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5307 CostKind, OpInfo, I);
5308 const int NumEltPerXMM = XMMBits / EltTyBits;
5309
5310 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5311
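// The loop below greedily covers the remaining elements with the current
// op width, halving that width whenever the remainder no longer fills it.
// E.g. a v3f32 store ends up costed (roughly) as one 8-byte store plus one
// 4-byte store of a separately extracted element.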
5312 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5313 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5314 // How many elements would a single op deal with at once?
5315 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5316 // Vector size must be a multiple of the element size. I.e. no padding.
5317 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5318 CostKind, OpInfo, I);
5319 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5320
5321 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5322 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5323 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5324 "Unless we haven't halved the op size yet, "
5325 "we have less than two op's sized units of work left.");
5326
5327 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5328 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5329 : XMMVecTy;
5330
5331 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5332 "After halving sizes, the vector elt count is no longer a multiple "
5333 "of number of elements per operation?");
5334 auto *CoalescedVecTy =
5335 CurrNumEltPerOp == 1
5336 ? CurrVecTy
5337 : FixedVectorType::get(
5338 IntegerType::get(Src->getContext(),
5339 EltTyBits * CurrNumEltPerOp),
5340 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5341 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5342 DL.getTypeSizeInBits(CurrVecTy) &&
5343 "coalesciing elements doesn't change vector width.");
5344
5345 while (NumEltRemaining > 0) {
5346 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
5347
5348 // Can we use this vector size, as per the remaining element count?
5349 // Iff the vector is naturally aligned, we can do a wide load regardless.
5350 if (NumEltRemaining < CurrNumEltPerOp &&
5351 (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1)
5352 break; // Try smaller vector size.
5353
5354 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5355 // as a proxy for a double-pumped AVX memory interface such as on
5356 // Sandybridge.
5357 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5358 // will be scalarized.
5359 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5360 Cost += 2;
5361 else if (CurrOpSizeBytes < 4)
5362 Cost += 2;
5363 else
5364 Cost += 1;
5365
5366 // If we're loading a uniform value, then we don't need to split the load;
5367 // a single load of the widest vector can be reused by all the splits.
5368 if (IsLoad && OpInfo.isUniform())
5369 return Cost;
5370
5371 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5372
5373 // If we have fully processed the previous reg, we need to replenish it.
5374 if (SubVecEltsLeft == 0) {
5375 SubVecEltsLeft += CurrVecTy->getNumElements();
5376 // And that's free only for the 0'th subvector of a legalized vector.
5377 if (!Is0thSubVec)
5378 Cost +=
5379 getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
5380 : TTI::ShuffleKind::SK_ExtractSubvector,
5381 VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5382 }
5383
5384 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5385 // for smaller widths (32/16/8) we have to insert/extract them separately.
5386 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5387 // but let's pretend that it is also true for 16/8 bit wide ops...)
5388 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5389 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5390 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5391 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5392 APInt DemandedElts =
5393 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5394 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5395 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5396 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5397 !IsLoad, CostKind);
5398 }
5399
5400 SubVecEltsLeft -= CurrNumEltPerOp;
5401 NumEltRemaining -= CurrNumEltPerOp;
5402 Alignment = commonAlignment(Alignment, CurrOpSizeBytes);
5403 }
5404 }
5405
5406 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5407
5408 return Cost;
5409}
5410
5411 InstructionCost
5412X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5413 unsigned AddressSpace,
5414 TTI::TargetCostKind CostKind) const {
5415 bool IsLoad = (Instruction::Load == Opcode);
5416 bool IsStore = (Instruction::Store == Opcode);
5417
5418 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5419 if (!SrcVTy)
5420 // For a scalar, take the regular memory op cost without the mask.
5421 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5422
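// For costing purposes the mask is modelled as a vXi8 vector, one byte per
// data element.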
5423 unsigned NumElem = SrcVTy->getNumElements();
5424 auto *MaskTy =
5425 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5426 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment, AddressSpace)) ||
5427 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment, AddressSpace))) {
5428 // Scalarization
5429 APInt DemandedElts = APInt::getAllOnes(NumElem);
5430 InstructionCost MaskSplitCost = getScalarizationOverhead(
5431 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5432 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5433 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5434 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5435 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5436 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5437 InstructionCost ValueSplitCost = getScalarizationOverhead(
5438 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5439 InstructionCost MemopCost =
5440 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5441 Alignment, AddressSpace, CostKind);
5442 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5443 }
5444
5445 // Legalize the type.
5446 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5447 auto VT = TLI->getValueType(DL, SrcVTy);
5448 InstructionCost Cost = 0;
5449 MVT Ty = LT.second;
5450 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5451 // APX masked load/store for scalar is cheap.
5452 return Cost + LT.first;
5453
5454 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5455 LT.second.getVectorNumElements() == NumElem)
5456 // Promotion requires extend/truncate for data and a shuffle for mask.
5457 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
5458 0, nullptr) +
5459 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
5460 0, nullptr);
5461
5462 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5463 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5464 (unsigned)LT.first.getValue() *
5465 Ty.getVectorNumElements());
5466 // Expanding requires filling the mask with zeroes.
5467 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
5468 CostKind, 0, MaskTy);
5469 }
5470
5471 // Pre-AVX512 - each maskmov load costs 2, each maskmov store costs ~8.
5472 if (!ST->hasAVX512())
5473 return Cost + LT.first * (IsLoad ? 2 : 8);
5474
5475 // AVX-512 masked load/store is cheaper
5476 return Cost + LT.first;
5477}
5478
5479 InstructionCost X86TTIImpl::getPointersChainCost(
5480 ArrayRef<const Value *> Ptrs, const Value *Base,
5481 const TTI::PointersChainInfo &Info, Type *AccessTy,
5482 TTI::TargetCostKind CostKind) const {
5483 if (Info.isSameBase() && Info.isKnownStride()) {
5484 // If all the pointers have known stride all the differences are translated
5485 // into constants. X86 memory addressing allows encoding it into
5486 // displacement. So we just need to take the base GEP cost.
5487 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5488 SmallVector<const Value *> Indices(BaseGEP->indices());
5489 return getGEPCost(BaseGEP->getSourceElementType(),
5490 BaseGEP->getPointerOperand(), Indices, nullptr,
5491 CostKind);
5492 }
5493 return TTI::TCC_Free;
5494 }
5495 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5496}
5497
5498 InstructionCost
5499 X86TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
5500 const SCEV *Ptr,
5501 TTI::TargetCostKind CostKind) const {
5502 // Address computations in vectorized code with non-consecutive addresses will
5503 // likely result in more instructions compared to scalar code where the
5504 // computation can more often be merged into the index mode. The resulting
5505 // extra micro-ops can significantly decrease throughput.
5506 const unsigned NumVectorInstToHideOverhead = 10;
5507
5508 // Cost modeling of Strided Access Computation is hidden by the indexing
5509 // modes of X86 regardless of the stride value. We don't believe that there
5510 // is a difference between constant strided access in general and a constant
5511 // stride whose value is less than or equal to 64.
5512 // Even in the case of (loop invariant) stride whose value is not known at
5513 // compile time, the address computation will not incur more than one extra
5514 // ADD instruction.
5515 if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
5516 // TODO: AVX2 is the current cut-off because we don't have correct
5517 // interleaving costs for prior ISA's.
5518 if (!BaseT::isStridedAccess(Ptr))
5519 return NumVectorInstToHideOverhead;
5520 if (!BaseT::getConstantStrideStep(SE, Ptr))
5521 return 1;
5522 }
5523
5524 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
5525}
5526
5527 InstructionCost
5528 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5529 std::optional<FastMathFlags> FMF,
5530 TTI::TargetCostKind CostKind) const {
5531 if (TTI::requiresOrderedReduction(FMF))
5532 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5533
5534 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5535 // throughput and use it as the cost.
5536
5537 static const CostTblEntry SLMCostTbl[] = {
5538 { ISD::FADD, MVT::v2f64, 3 },
5539 { ISD::ADD, MVT::v2i64, 5 },
5540 };
5541
5542 static const CostTblEntry SSE2CostTbl[] = {
5543 { ISD::FADD, MVT::v2f64, 2 },
5544 { ISD::FADD, MVT::v2f32, 2 },
5545 { ISD::FADD, MVT::v4f32, 4 },
5546 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5547 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5548 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5549 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5550 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5551 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5552 { ISD::ADD, MVT::v2i8, 2 },
5553 { ISD::ADD, MVT::v4i8, 2 },
5554 { ISD::ADD, MVT::v8i8, 2 },
5555 { ISD::ADD, MVT::v16i8, 3 },
5556 };
5557
5558 static const CostTblEntry AVX1CostTbl[] = {
5559 { ISD::FADD, MVT::v4f64, 3 },
5560 { ISD::FADD, MVT::v4f32, 3 },
5561 { ISD::FADD, MVT::v8f32, 4 },
5562 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5563 { ISD::ADD, MVT::v4i64, 3 },
5564 { ISD::ADD, MVT::v8i32, 5 },
5565 { ISD::ADD, MVT::v16i16, 5 },
5566 { ISD::ADD, MVT::v32i8, 4 },
5567 };
5568
5569 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5570 assert(ISD && "Invalid opcode");
5571
5572 // Before legalizing the type, give a chance to look up illegal narrow types
5573 // in the table.
5574 // FIXME: Is there a better way to do this?
5575 EVT VT = TLI->getValueType(DL, ValTy);
5576 if (VT.isSimple()) {
5577 MVT MTy = VT.getSimpleVT();
5578 if (ST->useSLMArithCosts())
5579 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5580 return Entry->Cost;
5581
5582 if (ST->hasAVX())
5583 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5584 return Entry->Cost;
5585
5586 if (ST->hasSSE2())
5587 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5588 return Entry->Cost;
5589 }
5590
5591 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5592
5593 MVT MTy = LT.second;
5594
5595 auto *ValVTy = cast<FixedVectorType>(ValTy);
5596
5597 // Special case: vXi8 mul reductions are performed as vXi16.
5598 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5599 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5600 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5601 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5602 TargetTransformInfo::CastContextHint::None,
5603 CostKind) +
5604 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5605 }
5606
5607 InstructionCost ArithmeticCost = 0;
5608 if (LT.first != 1 && MTy.isVector() &&
5609 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5610 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5611 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5612 MTy.getVectorNumElements());
5613 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5614 ArithmeticCost *= LT.first - 1;
5615 }
5616
5617 if (ST->useSLMArithCosts())
5618 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5619 return ArithmeticCost + Entry->Cost;
5620
5621 if (ST->hasAVX())
5622 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5623 return ArithmeticCost + Entry->Cost;
5624
5625 if (ST->hasSSE2())
5626 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5627 return ArithmeticCost + Entry->Cost;
5628
5629 // FIXME: These assume a naive kshift+binop lowering, which is probably
5630 // conservative in most cases.
5631 static const CostTblEntry AVX512BoolReduction[] = {
5632 { ISD::AND, MVT::v2i1, 3 },
5633 { ISD::AND, MVT::v4i1, 5 },
5634 { ISD::AND, MVT::v8i1, 7 },
5635 { ISD::AND, MVT::v16i1, 9 },
5636 { ISD::AND, MVT::v32i1, 11 },
5637 { ISD::AND, MVT::v64i1, 13 },
5638 { ISD::OR, MVT::v2i1, 3 },
5639 { ISD::OR, MVT::v4i1, 5 },
5640 { ISD::OR, MVT::v8i1, 7 },
5641 { ISD::OR, MVT::v16i1, 9 },
5642 { ISD::OR, MVT::v32i1, 11 },
5643 { ISD::OR, MVT::v64i1, 13 },
5644 };
5645
5646 static const CostTblEntry AVX2BoolReduction[] = {
5647 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5648 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5649 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5650 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5651 };
5652
5653 static const CostTblEntry AVX1BoolReduction[] = {
5654 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5655 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5656 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5657 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5658 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5659 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5660 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5661 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5662 };
5663
5664 static const CostTblEntry SSE2BoolReduction[] = {
5665 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5666 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5667 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5668 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5669 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5670 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5671 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5672 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5673 };
5674
5675 // Handle bool allof/anyof patterns.
5676 if (ValVTy->getElementType()->isIntegerTy(1)) {
5677 InstructionCost ArithmeticCost = 0;
5678 if (LT.first != 1 && MTy.isVector() &&
5679 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5680 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5681 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5682 MTy.getVectorNumElements());
5683 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5684 ArithmeticCost *= LT.first - 1;
5685 }
5686
5687 if (ST->hasAVX512())
5688 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5689 return ArithmeticCost + Entry->Cost;
5690 if (ST->hasAVX2())
5691 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5692 return ArithmeticCost + Entry->Cost;
5693 if (ST->hasAVX())
5694 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5695 return ArithmeticCost + Entry->Cost;
5696 if (ST->hasSSE2())
5697 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5698 return ArithmeticCost + Entry->Cost;
5699
5700 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5701 }
5702
5703 unsigned NumVecElts = ValVTy->getNumElements();
5704 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5705
5706 // Special case power of 2 reductions where the scalar type isn't changed
5707 // by type legalization.
5708 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5709 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5710
5711 InstructionCost ReductionCost = 0;
5712
5713 auto *Ty = ValVTy;
5714 if (LT.first != 1 && MTy.isVector() &&
5715 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5716 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5717 Ty = FixedVectorType::get(ValVTy->getElementType(),
5718 MTy.getVectorNumElements());
5719 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5720 ReductionCost *= LT.first - 1;
5721 NumVecElts = MTy.getVectorNumElements();
5722 }
5723
5724 // Now handle reduction with the legal type, taking into account size changes
5725 // at each level.
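// E.g. a v8f32 fadd reduction on AVX is costed as: extract the upper
// 128-bit half + fadd, a v2f64-style permute + fadd, a v4f32 shuffle +
// fadd, and a final extract of element 0.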
5726 while (NumVecElts > 1) {
5727 // Determine the size of the remaining vector we need to reduce.
5728 unsigned Size = NumVecElts * ScalarSize;
5729 NumVecElts /= 2;
5730 // If we're reducing from 256/512 bits, use an extract_subvector.
5731 if (Size > 128) {
5732 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5733 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5734 CostKind, NumVecElts, SubTy);
5735 Ty = SubTy;
5736 } else if (Size == 128) {
5737 // Reducing from 128 bits is a permute of v2f64/v2i64.
5738 FixedVectorType *ShufTy;
5739 if (ValVTy->isFloatingPointTy())
5740 ShufTy =
5741 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5742 else
5743 ShufTy =
5744 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5745 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5746 {}, CostKind, 0, nullptr);
5747 } else if (Size == 64) {
5748 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5749 FixedVectorType *ShufTy;
5750 if (ValVTy->isFloatingPointTy())
5751 ShufTy =
5752 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5753 else
5754 ShufTy =
5755 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5756 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5757 {}, CostKind, 0, nullptr);
5758 } else {
5759 // Reducing from smaller size is a shift by immediate.
5760 auto *ShiftTy = FixedVectorType::get(
5761 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5762 ReductionCost += getArithmeticInstrCost(
5763 Instruction::LShr, ShiftTy, CostKind,
5764 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5765 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5766 }
5767
5768 // Add the arithmetic op for this level.
5769 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5770 }
5771
5772 // Add the final extract element to the cost.
5773 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5774 CostKind, 0, nullptr, nullptr);
5775}
5776
5777 InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5778 TTI::TargetCostKind CostKind,
5779 FastMathFlags FMF) const {
5780 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5781 return getIntrinsicInstrCost(ICA, CostKind);
5782}
5783
5784 InstructionCost
5785 X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5786 FastMathFlags FMF,
5787 TTI::TargetCostKind CostKind) const {
5788 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5789
5790 MVT MTy = LT.second;
5791
5792 int ISD;
5793 if (ValTy->isIntOrIntVectorTy()) {
5794 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5795 : ISD::SMIN;
5796 } else {
5797 assert(ValTy->isFPOrFPVectorTy() &&
5798 "Expected float point or integer vector type.");
5799 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5800 ? ISD::FMINNUM
5801 : ISD::FMINIMUM;
5802 }
5803
5804 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5805 // throughput and use it as the cost.
5806
5807 static const CostTblEntry SSE2CostTbl[] = {
5808 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5809 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5810 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5811 };
5812
5813 static const CostTblEntry SSE41CostTbl[] = {
5814 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5815 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5816 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5817 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5818 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5819 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5820 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5821 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5822 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5823 {ISD::SMIN, MVT::v16i8, 6},
5824 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5825 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5826 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5827 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5828 };
5829
5830 static const CostTblEntry AVX1CostTbl[] = {
5831 {ISD::SMIN, MVT::v16i16, 6},
5832 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5833 {ISD::SMIN, MVT::v32i8, 8},
5834 {ISD::UMIN, MVT::v32i8, 8},
5835 };
5836
5837 static const CostTblEntry AVX512BWCostTbl[] = {
5838 {ISD::SMIN, MVT::v32i16, 8},
5839 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5840 {ISD::SMIN, MVT::v64i8, 10},
5841 {ISD::UMIN, MVT::v64i8, 10},
5842 };
5843
5844 // Before legalizing the type, give a chance to look up illegal narrow types
5845 // in the table.
5846 // FIXME: Is there a better way to do this?
5847 EVT VT = TLI->getValueType(DL, ValTy);
5848 if (VT.isSimple()) {
5849 MVT MTy = VT.getSimpleVT();
5850 if (ST->hasBWI())
5851 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5852 return Entry->Cost;
5853
5854 if (ST->hasAVX())
5855 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5856 return Entry->Cost;
5857
5858 if (ST->hasSSE41())
5859 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5860 return Entry->Cost;
5861
5862 if (ST->hasSSE2())
5863 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5864 return Entry->Cost;
5865 }
5866
5867 auto *ValVTy = cast<FixedVectorType>(ValTy);
5868 unsigned NumVecElts = ValVTy->getNumElements();
5869
5870 auto *Ty = ValVTy;
5871 InstructionCost MinMaxCost = 0;
5872 if (LT.first != 1 && MTy.isVector() &&
5873 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5874 // Type needs to be split. We need LT.first - 1 operations ops.
5875 Ty = FixedVectorType::get(ValVTy->getElementType(),
5876 MTy.getVectorNumElements());
5877 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5878 MinMaxCost *= LT.first - 1;
5879 NumVecElts = MTy.getVectorNumElements();
5880 }
5881
5882 if (ST->hasBWI())
5883 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5884 return MinMaxCost + Entry->Cost;
5885
5886 if (ST->hasAVX())
5887 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5888 return MinMaxCost + Entry->Cost;
5889
5890 if (ST->hasSSE41())
5891 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5892 return MinMaxCost + Entry->Cost;
5893
5894 if (ST->hasSSE2())
5895 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5896 return MinMaxCost + Entry->Cost;
5897
5898 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5899
5900 // Special case power of 2 reductions where the scalar type isn't changed
5901 // by type legalization.
5902 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5903 ScalarSize != MTy.getScalarSizeInBits())
5904 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5905
5906 // Now handle reduction with the legal type, taking into account size changes
5907 // at each level.
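// This mirrors the halving ladder used for the arithmetic reductions
// above, with a min/max operation instead of a binop at each step.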
5908 while (NumVecElts > 1) {
5909 // Determine the size of the remaining vector we need to reduce.
5910 unsigned Size = NumVecElts * ScalarSize;
5911 NumVecElts /= 2;
5912 // If we're reducing from 256/512 bits, use an extract_subvector.
5913 if (Size > 128) {
5914 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5915 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5916 CostKind, NumVecElts, SubTy);
5917 Ty = SubTy;
5918 } else if (Size == 128) {
5919 // Reducing from 128 bits is a permute of v2f64/v2i64.
5920 VectorType *ShufTy;
5921 if (ValTy->isFloatingPointTy())
5922 ShufTy =
5924 else
5925 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5926 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5927 CostKind, 0, nullptr);
5928 } else if (Size == 64) {
5929 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5930 FixedVectorType *ShufTy;
5931 if (ValTy->isFloatingPointTy())
5932 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5933 else
5934 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5935 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5936 CostKind, 0, nullptr);
5937 } else {
5938 // Reducing from smaller size is a shift by immediate.
5939 auto *ShiftTy = FixedVectorType::get(
5940 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5941 MinMaxCost += getArithmeticInstrCost(
5942 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5943 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5944 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5945 }
5946
5947 // Add the arithmetic op for this level.
5948 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5949 }
5950
5951 // Add the final extract element to the cost.
5952 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5953 CostKind, 0, nullptr, nullptr);
5954}
5955
5956/// Calculate the cost of materializing a 64-bit value. This helper
5957/// method might only calculate a fraction of a larger immediate. Therefore it
5958/// is valid to return a cost of ZERO.
5959 InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) const {
5960 if (Val == 0)
5961 return TTI::TCC_Free;
5962
5963 if (isInt<32>(Val))
5964 return TTI::TCC_Basic;
5965
5966 return 2 * TTI::TCC_Basic;
5967}
5968
5969 InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5970 TTI::TargetCostKind CostKind) const {
5971 assert(Ty->isIntegerTy());
5972
5973 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5974 if (BitSize == 0)
5975 return ~0U;
5976
5977 // Never hoist constants larger than 128 bits, because this might lead to
5978 // incorrect code generation or assertions in codegen.
5979 // FIXME: Create a cost model for types larger than i128 once the codegen
5980 // issues have been fixed.
5981 if (BitSize > 128)
5982 return TTI::TCC_Free;
5983
5984 if (Imm == 0)
5985 return TTI::TCC_Free;
5986
5987 // Sign-extend all constants to a multiple of 64-bit.
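// E.g. an i96 immediate is sign-extended to i128 and costed below as two
// 64-bit chunks.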
5988 APInt ImmVal = Imm;
5989 if (BitSize % 64 != 0)
5990 ImmVal = Imm.sext(alignTo(BitSize, 64));
5991
5992 // Split the constant into 64-bit chunks and calculate the cost for each
5993 // chunk.
5994 InstructionCost Cost = 0;
5995 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5996 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5997 int64_t Val = Tmp.getSExtValue();
5998 Cost += getIntImmCost(Val);
5999 }
6000 // We need at least one instruction to materialize the constant.
6001 return std::max<InstructionCost>(1, Cost);
6002}
6003
6004 InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
6005 const APInt &Imm, Type *Ty,
6006 TTI::TargetCostKind CostKind,
6007 Instruction *Inst) const {
6008 assert(Ty->isIntegerTy());
6009
6010 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6011 unsigned ImmBitWidth = Imm.getBitWidth();
6012
6013 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6014 // here, so that constant hoisting will ignore this constant.
6015 if (BitSize == 0)
6016 return TTI::TCC_Free;
6017
6018 unsigned ImmIdx = ~0U;
6019 switch (Opcode) {
6020 default:
6021 return TTI::TCC_Free;
6022 case Instruction::GetElementPtr:
6023 // Always hoist the base address of a GetElementPtr. This prevents the
6024 // creation of new constants for every base constant that gets constant
6025 // folded with the offset.
6026 if (Idx == 0)
6027 return 2 * TTI::TCC_Basic;
6028 return TTI::TCC_Free;
6029 case Instruction::Store:
6030 ImmIdx = 0;
6031 break;
6032 case Instruction::ICmp:
6033 // This is an imperfect hack to prevent constant hoisting of
6034 // compares that might be trying to check if a 64-bit value fits in
6035 // 32-bits. The backend can optimize these cases using a right shift by 32.
6036 // There are other predicates and immediates the backend can use shifts for.
6037 if (Idx == 1 && ImmBitWidth == 64) {
6038 uint64_t ImmVal = Imm.getZExtValue();
6039 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
6040 return TTI::TCC_Free;
6041
6042 if (auto *Cmp = dyn_cast_or_null<CmpInst>(Inst)) {
6043 if (Cmp->isEquality()) {
6044 KnownBits Known = computeKnownBits(Cmp->getOperand(0), DL);
6045 if (Known.countMinTrailingZeros() >= 32)
6046 return TTI::TCC_Free;
6047 }
6048 }
6049 }
6050 ImmIdx = 1;
6051 break;
6052 case Instruction::And:
6053 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
6054 // by using a 32-bit operation with implicit zero extension. Detect such
6055 // immediates here as the normal path expects bit 31 to be sign extended.
6056 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
6057 return TTI::TCC_Free;
6058 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
6059 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
6060 Imm.isMask())
6061 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
6062 ImmIdx = 1;
6063 break;
6064 case Instruction::Add:
6065 case Instruction::Sub:
6066 // For add/sub, we can use the opposite instruction for INT32_MIN.
6067 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6068 return TTI::TCC_Free;
6069 ImmIdx = 1;
6070 break;
6071 case Instruction::UDiv:
6072 case Instruction::SDiv:
6073 case Instruction::URem:
6074 case Instruction::SRem:
6075 // Division by constant is typically expanded later into a different
6076 // instruction sequence. This completely changes the constants.
6077 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6078 return TTI::TCC_Free;
6079 case Instruction::Mul:
6080 case Instruction::Or:
6081 case Instruction::Xor:
6082 ImmIdx = 1;
6083 break;
6084 // Always return TCC_Free for the shift value of a shift instruction.
6085 case Instruction::Shl:
6086 case Instruction::LShr:
6087 case Instruction::AShr:
6088 if (Idx == 1)
6089 return TTI::TCC_Free;
6090 break;
6091 case Instruction::Trunc:
6092 case Instruction::ZExt:
6093 case Instruction::SExt:
6094 case Instruction::IntToPtr:
6095 case Instruction::PtrToInt:
6096 case Instruction::BitCast:
6097 case Instruction::PHI:
6098 case Instruction::Call:
6099 case Instruction::Select:
6100 case Instruction::Ret:
6101 case Instruction::Load:
6102 break;
6103 }
6104
6105 if (Idx == ImmIdx) {
6106 uint64_t NumConstants = divideCeil(BitSize, 64);
6107 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6108 return (Cost <= NumConstants * TTI::TCC_Basic)
6109 ? static_cast<int>(TTI::TCC_Free)
6110 : Cost;
6111 }
6112
6113 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6114}
6115
6116InstructionCost
6117X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
6118 const APInt &Imm, Type *Ty,
6119 TTI::TargetCostKind CostKind) const {
6120 assert(Ty->isIntegerTy());
6121
6122 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6123 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6124 // here, so that constant hoisting will ignore this constant.
6125 if (BitSize == 0)
6126 return TTI::TCC_Free;
6127
6128 switch (IID) {
6129 default:
6130 return TTI::TCC_Free;
6131 case Intrinsic::sadd_with_overflow:
6132 case Intrinsic::uadd_with_overflow:
6133 case Intrinsic::ssub_with_overflow:
6134 case Intrinsic::usub_with_overflow:
6135 case Intrinsic::smul_with_overflow:
6136 case Intrinsic::umul_with_overflow:
6137 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6138 return TTI::TCC_Free;
6139 break;
6140 case Intrinsic::experimental_stackmap:
6141 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6142 return TTI::TCC_Free;
6143 break;
6144 case Intrinsic::experimental_patchpoint_void:
6145 case Intrinsic::experimental_patchpoint:
6146 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6147 return TTI::TCC_Free;
6148 break;
6149 }
6150 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6151}
6152
6153InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
6154 TTI::TargetCostKind CostKind,
6155 const Instruction *I) const {
6156 if (CostKind != TTI::TCK_RecipThroughput)
6157 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6158 // Branches are assumed to be predicted.
6159 return TTI::TCC_Free;
6160}
6161
6162int X86TTIImpl::getGatherOverhead() const {
6163 // Some CPUs have higher overhead for gather. The specified overhead is
6164 // relative to the Load operation. "2" is the number provided by Intel
6165 // architects. This parameter is used for cost estimation of a Gather op and
6166 // for comparison with other alternatives.
6167 // TODO: Remove the explicit hasAVX512()? That would mean we would only
6168 // enable gather with a -march.
6169 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6170 return 2;
6171
6172 return 1024;
6173}
6174
6175int X86TTIImpl::getScatterOverhead() const {
6176 if (ST->hasAVX512())
6177 return 2;
6178
6179 return 1024;
6180}
6181
6182// Return an average cost of a Gather / Scatter instruction; may be refined later.
6183InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6185 Type *SrcVTy, const Value *Ptr,
6186 Align Alignment,
6187 unsigned AddressSpace) const {
6188
6189 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6190 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6191
6192 // Try to reduce the index size from 64 bits (the default for GEP)
6193 // to 32. This is essential for VF 16: if the index can't be reduced to 32
6194 // bits, the operation uses 16 x 64-bit indices, which do not fit in a zmm
6195 // register and force a split. Also check that the base pointer is the same
6196 // for all lanes, and that there's at most one variable index.
6197 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6198 unsigned IndexSize = DL.getPointerSizeInBits();
6199 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6200 if (IndexSize < 64 || !GEP)
6201 return IndexSize;
6202
6203 unsigned NumOfVarIndices = 0;
6204 const Value *Ptrs = GEP->getPointerOperand();
6205 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6206 return IndexSize;
6207 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6208 if (isa<Constant>(GEP->getOperand(I)))
6209 continue;
6210 Type *IndxTy = GEP->getOperand(I)->getType();
6211 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6212 IndxTy = IndexVTy->getElementType();
6213 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6214 !isa<SExtInst>(GEP->getOperand(I))) ||
6215 ++NumOfVarIndices > 1)
6216 return IndexSize; // 64
6217 }
6218 return (unsigned)32;
6219 };
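// e.g. with VF = 16 and 64-bit indices the index vector is 16 x 64 = 1024
// bits, which needs two zmm registers and forces a split; reduced 32-bit
// indices fit in 16 x 32 = 512 bits, keeping a single gather/scatter.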
6220
6221 // Trying to reduce IndexSize to 32 bits for vector 16.
6222 // By default the IndexSize is equal to pointer size.
6223 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6224 ? getIndexSizeInBits(Ptr, DL)
6225 : DL.getPointerSizeInBits();
6226
6227 auto *IndexVTy = FixedVectorType::get(
6228 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6229 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6230 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6231 InstructionCost::CostType SplitFactor =
6232 std::max(IdxsLT.first, SrcLT.first).getValue();
6233 if (SplitFactor > 1) {
6234 // Handle splitting of vector of pointers
6235 auto *SplitSrcTy =
6236 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6237 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6238 Alignment, AddressSpace);
6239 }
6240
6241 // If we didn't split, this will be a single gather/scatter instruction.
6242 if (CostKind == TTI::TCK_CodeSize)
6243 return 1;
6244
6245 // The gather / scatter cost is given by Intel architects. It is a rough
6246 // number since we are looking at one instruction at a time.
6247 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6248 : getScatterOverhead();
6249 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6250 Alignment, AddressSpace, CostKind);
6251}
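// Illustrative application of the formula above, assuming a unit scalar load
// cost: an 8 x f32 gather on an AVX2 target with fast gathers costs roughly
// GSOverhead + VF * MemOpCost = 2 + 8 * 1 = 10 reciprocal throughput units;
// the 1024 overhead on other subtargets makes a gather effectively never
// worth it.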
6252
6253/// Calculate the cost of Gather / Scatter operation
6254InstructionCost X86TTIImpl::getGatherScatterOpCost(
6255 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
6256 Align Alignment, TTI::TargetCostKind CostKind,
6257 const Instruction *I = nullptr) const {
6258 if ((Opcode == Instruction::Load &&
6259 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6260 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6261 Align(Alignment)))) ||
6262 (Opcode == Instruction::Store &&
6263 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6264 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6265 Align(Alignment)))))
6266 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
6267 Alignment, CostKind, I);
6268
6269 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6270 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6271 if (!PtrTy && Ptr->getType()->isVectorTy())
6272 PtrTy = dyn_cast<PointerType>(
6273 cast<VectorType>(Ptr->getType())->getElementType());
6274 assert(PtrTy && "Unexpected type for Ptr argument");
6275 unsigned AddressSpace = PtrTy->getAddressSpace();
6276 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6277 AddressSpace);
6278}
6279
6280bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6281 const TargetTransformInfo::LSRCost &C2) const {
6282 // X86-specific here: the instruction count has first priority.
6283 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
6284 C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6285 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
6286 C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6287}
6288
6289bool X86TTIImpl::canMacroFuseCmp() const {
6290 return ST->hasMacroFusion() || ST->hasBranchFusion();
6291}
6292
6293static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) {
6294 if (!ST->hasAVX())
6295 return false;
6296
6297 if (ScalarTy->isPointerTy())
6298 return true;
6299
6300 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6301 return true;
6302
6303 if (ScalarTy->isHalfTy() && ST->hasBWI())
6304 return true;
6305
6306 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6307 return true;
6308
6309 if (!ScalarTy->isIntegerTy())
6310 return false;
6311
6312 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6313 return IntWidth == 32 || IntWidth == 64 ||
6314 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6315}
6316
6317bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
6318 unsigned AddressSpace) const {
6319 Type *ScalarTy = DataTy->getScalarType();
6320
6321 // The backend can't handle a single element vector w/o CFCMOV.
6322 if (isa<VectorType>(DataTy) &&
6323 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6324 return ST->hasCF() &&
6325 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/false);
6326
6327 return isLegalMaskedLoadStore(ScalarTy, ST);
6328}
6329
6330bool X86TTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment,
6331 unsigned AddressSpace) const {
6332 Type *ScalarTy = DataTy->getScalarType();
6333
6334 // The backend can't handle a single element vector w/o CFCMOV.
6335 if (isa<VectorType>(DataTy) &&
6336 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6337 return ST->hasCF() &&
6338 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/true);
6339
6340 return isLegalMaskedLoadStore(ScalarTy, ST);
6341}
6342
6343bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const {
6344 unsigned DataSize = DL.getTypeStoreSize(DataType);
6345 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6346 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6347 // (the equivalent stores only require AVX).
6348 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6349 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6350
6351 return false;
6352}
6353
6354bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const {
6355 unsigned DataSize = DL.getTypeStoreSize(DataType);
6356
6357 // SSE4A supports nontemporal stores of float and double at arbitrary
6358 // alignment.
6359 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6360 return true;
6361
6362 // Besides the SSE4A subtarget exception above, only aligned stores are
6363 // available nontemporally on any other subtarget. And only stores with a
6364 // size of 4..32 bytes (powers of 2, only) are permitted.
6365 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6366 !isPowerOf2_32(DataSize))
6367 return false;
6368
6369 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6370 // loads require AVX2).
6371 if (DataSize == 32)
6372 return ST->hasAVX();
6373 if (DataSize == 16)
6374 return ST->hasSSE1();
6375 return true;
6376}
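// e.g. a 32-byte nontemporal store of <8 x float> maps to the AVX vmovntps
// (hence the hasAVX() requirement), while a 16-byte <4 x float> store only
// needs the SSE1 movntps.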
6377
6378bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6379 ElementCount NumElements) const {
6380 // movddup
6381 return ST->hasSSE3() && !NumElements.isScalable() &&
6382 NumElements.getFixedValue() == 2 &&
6383 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6384}
6385
6386bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const {
6387 if (!isa<VectorType>(DataTy))
6388 return false;
6389
6390 if (!ST->hasAVX512())
6391 return false;
6392
6393 // The backend can't handle a single element vector.
6394 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6395 return false;
6396
6397 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6398
6399 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6400 return true;
6401
6402 if (!ScalarTy->isIntegerTy())
6403 return false;
6404
6405 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6406 return IntWidth == 32 || IntWidth == 64 ||
6407 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6408}
6409
6410bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy,
6411 Align Alignment) const {
6412 return isLegalMaskedExpandLoad(DataTy, Alignment);
6413}
6414
6415bool X86TTIImpl::supportsGather() const {
6416 // Some CPUs have better gather performance than others.
6417 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6418 // enable gather with a -march.
6419 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6420}
6421
6422bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy,
6423 Align Alignment) const {
6424 // Gather / Scatter for vectors of 2 elements is not profitable on KNL / SKX.
6425 // A 4-element gather/scatter instruction does not exist on KNL. We could
6426 // extend it to 8 elements, but zeroing the upper bits of the mask vector
6427 // would add more instructions. Right now we give vector-4 the scalar cost on
6428 // KNL. TODO: Check whether the gather/scatter instruction is better in the
6429 // VariableMask case.
6430 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6431 return NumElts == 1 ||
6432 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6433}
6434
6435bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy,
6436 Align Alignment) const {
6437 Type *ScalarTy = DataTy->getScalarType();
6438 if (ScalarTy->isPointerTy())
6439 return true;
6440
6441 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6442 return true;
6443
6444 if (!ScalarTy->isIntegerTy())
6445 return false;
6446
6447 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6448 return IntWidth == 32 || IntWidth == 64;
6449}
6450
6451bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const {
6452 if (!supportsGather() || !ST->preferGather())
6453 return false;
6454 return isLegalMaskedGatherScatter(DataTy, Alignment);
6455}
6456
6457bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6458 unsigned Opcode1,
6459 const SmallBitVector &OpcodeMask) const {
6460 // ADDSUBPS 4xf32 SSE3
6461 // VADDSUBPS 4xf32 AVX
6462 // VADDSUBPS 8xf32 AVX2
6463 // ADDSUBPD 2xf64 SSE3
6464 // VADDSUBPD 2xf64 AVX
6465 // VADDSUBPD 4xf64 AVX2
6466
6467 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6468 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6469 if (!isPowerOf2_32(NumElements))
6470 return false;
6471 // Check the opcode pattern. We apply the mask to the opcode arguments and
6472 // then check if it is what we expect.
6473 for (int Lane : seq<int>(0, NumElements)) {
6474 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6475 // We expect FSub for even lanes and FAdd for odd lanes.
6476 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6477 return false;
6478 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6479 return false;
6480 }
6481 // Now check that the pattern is supported by the target ISA.
6482 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6483 if (ElemTy->isFloatTy())
6484 return ST->hasSSE3() && NumElements % 4 == 0;
6485 if (ElemTy->isDoubleTy())
6486 return ST->hasSSE3() && NumElements % 2 == 0;
6487 return false;
6488}
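// e.g. (illustrative) with Opcode0 = FSub, Opcode1 = FAdd and
// OpcodeMask = 0b1010 on <4 x float>, lanes 0 and 2 subtract while lanes 1
// and 3 add, which is exactly the (V)ADDSUBPS pattern accepted above.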
6489
6490bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const {
6491 // AVX2 doesn't support scatter
6492 if (!ST->hasAVX512() || !ST->preferScatter())
6493 return false;
6494 return isLegalMaskedGatherScatter(DataType, Alignment);
6495}
6496
6497bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
6498 EVT VT = TLI->getValueType(DL, DataType);
6499 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6500}
6501
6502bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) const {
6503 // FDIV is always expensive, even if it has a very low uop count.
6504 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6505 if (I->getOpcode() == Instruction::FDiv)
6506 return true;
6507
6508 return BaseT::isExpensiveToSpeculativelyExecute(I);
6509}
6510
6511bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; }
6512
6513bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6514 const Function *Callee) const {
6515 const TargetMachine &TM = getTLI()->getTargetMachine();
6516
6517 // Work this as a subsetting of subtarget features.
6518 const FeatureBitset &CallerBits =
6519 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6520 const FeatureBitset &CalleeBits =
6521 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6522
6523 // Check whether features are the same (apart from the ignore list).
6524 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6525 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6526 if (RealCallerBits == RealCalleeBits)
6527 return true;
6528
6529 // If the features are a subset, we need to additionally check for calls
6530 // that may become ABI-incompatible as a result of inlining.
6531 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6532 return false;
6533
6534 for (const Instruction &I : instructions(Callee)) {
6535 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6536 // Having more target features is fine for inline ASM and intrinsics.
6537 if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
6538 continue;
6539
6540 SmallVector<Type *, 8> Types;
6541 for (Value *Arg : CB->args())
6542 Types.push_back(Arg->getType());
6543 if (!CB->getType()->isVoidTy())
6544 Types.push_back(CB->getType());
6545
6546 // Simple types are always ABI compatible.
6547 auto IsSimpleTy = [](Type *Ty) {
6548 return !Ty->isVectorTy() && !Ty->isAggregateType();
6549 };
6550 if (all_of(Types, IsSimpleTy))
6551 continue;
6552
6553 // Do a precise compatibility check.
6554 if (!areTypesABICompatible(Caller, Callee, Types))
6555 return false;
6556 }
6557 }
6558 return true;
6559}
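// Illustrative scenario for the checks above: a callee compiled with +avx2
// can generally be inlined into a caller compiled with +avx512f,+avx2 since
// the callee's features are a subset, unless some call inside the callee
// passes vector or aggregate values whose ABI could differ under the
// caller's wider feature set.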
6560
6561bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6562 const Function *Callee,
6563 const ArrayRef<Type *> &Types) const {
6564 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6565 return false;
6566
6567 // If we get here, we know the target features match. If one function
6568 // considers 512-bit vectors legal and the other does not, consider them
6569 // incompatible.
6570 const TargetMachine &TM = getTLI()->getTargetMachine();
6571
6572 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6573 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6574 return true;
6575
6576 // Consider the arguments compatible if they aren't vectors or aggregates.
6577 // FIXME: Look at the size of vectors.
6578 // FIXME: Look at the element types of aggregates to see if there are vectors.
6579 return llvm::none_of(Types,
6580 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6581}
6582
6583X86TTIImpl::TTI::MemCmpExpansionOptions
6584X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6585 TTI::MemCmpExpansionOptions Options;
6586 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6587 Options.NumLoadsPerBlock = 2;
6588 // All GPR and vector loads can be unaligned.
6589 Options.AllowOverlappingLoads = true;
6590 if (IsZeroCmp) {
6591 // Only enable vector loads for equality comparison. Right now the vector
6592 // version is not as fast for three way compare (see #33329).
6593 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6594 if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
6595 Options.LoadSizes.push_back(64);
6596 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6597 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6598 }
6599 if (ST->is64Bit()) {
6600 Options.LoadSizes.push_back(8);
6601 }
6602 Options.LoadSizes.push_back(4);
6603 Options.LoadSizes.push_back(2);
6604 Options.LoadSizes.push_back(1);
6605 return Options;
6606}
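// Illustrative effect of the options above: on a 64-bit AVX2 target the
// zero-compare LoadSizes list is {32, 16, 8, 4, 2, 1}, so an equality memcmp
// of 20 bytes can expand to a 16-byte vector compare plus one overlapping
// 16-byte compare (bytes 4..19), avoiding the libcall.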
6607
6608bool X86TTIImpl::prefersVectorizedAddressing() const {
6609 return supportsGather();
6610}
6611
6612bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6613 return false;
6614}
6615
6616bool X86TTIImpl::enableInterleavedAccessVectorization() const {
6617 // TODO: We expect this to be beneficial regardless of arch,
6618 // but there are currently some unexplained performance artifacts on Atom.
6619 // As a temporary solution, disable on Atom.
6620 return !(ST->isAtom());
6621}
6622
6623// Get an estimate for interleaved load/store operations and strided loads.
6624// \p Indices contains indices for strided load.
6625// \p Factor - the factor of interleaving.
6626// AVX-512 provides 3-src shuffles that significantly reduce the cost.
6627InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6628 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6629 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6630 TTI::TargetCostKind CostKind, bool UseMaskForCond,
6631 bool UseMaskForGaps) const {
6632 // VecTy for interleave memop is <VF*Factor x Elt>.
6633 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6634 // VecTy = <12 x i32>.
6635
6636 // Calculate the number of memory operations (NumOfMemOps) required
6637 // to load/store the VecTy.
6638 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6639 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6640 unsigned LegalVTSize = LegalVT.getStoreSize();
6641 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6642
6643 // Get the cost of one memory operation.
6644 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6645 LegalVT.getVectorNumElements());
6646 InstructionCost MemOpCost;
6647 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6648 if (UseMaskedMemOp)
6649 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6650 AddressSpace, CostKind);
6651 else
6652 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace,
6653 CostKind);
6654
6655 unsigned VF = VecTy->getNumElements() / Factor;
6656 MVT VT =
6657 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6658
6659 InstructionCost MaskCost;
6660 if (UseMaskedMemOp) {
6661 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6662 for (unsigned Index : Indices) {
6663 assert(Index < Factor && "Invalid index for interleaved memory op");
6664 for (unsigned Elm = 0; Elm < VF; Elm++)
6665 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6666 }
6667
6668 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6669
6670 MaskCost = getReplicationShuffleCost(
6671 I1Type, Factor, VF,
6672 UseMaskForGaps ? DemandedLoadStoreElts
6673 : APInt::getAllOnes(VecTy->getNumElements()),
6674 CostKind);
6675
6676 // The Gaps mask is invariant and created outside the loop, therefore the
6677 // cost of creating it is not accounted for here. However if we have both
6678 // a MaskForGaps and some other mask that guards the execution of the
6679 // memory access, we need to account for the cost of And-ing the two masks
6680 // inside the loop.
6681 if (UseMaskForGaps) {
6682 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6683 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6684 }
6685 }
6686
6687 if (Opcode == Instruction::Load) {
6688 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6689 // contain the cost of the optimized shuffle sequence that the
6690 // X86InterleavedAccess pass will generate.
6691 // The cost of loads and stores are computed separately from the table.
6692
6693 // X86InterleavedAccess supports only the following interleaved-access groups.
6694 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6695 {3, MVT::v16i8, 12}, // (load 48i8 and) deinterleave into 3 x 16i8
6696 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6697 {3, MVT::v64i8, 22}, // (load 192i8 and) deinterleave into 3 x 64i8
6698 };
6699
6700 if (const auto *Entry =
6701 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6702 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6703 // If an entry does not exist, fall back to the default implementation.
6704
6705 // Kind of shuffle depends on number of loaded values.
6706 // If we load the entire data in one register, we can use a 1-src shuffle.
6707 // Otherwise, we'll merge 2 sources in each operation.
6708 TTI::ShuffleKind ShuffleKind =
6709 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6710
6711 InstructionCost ShuffleCost = getShuffleCost(
6712 ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);
6713
6714 unsigned NumOfLoadsInInterleaveGrp =
6715 Indices.size() ? Indices.size() : Factor;
6716 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6717 VecTy->getNumElements() / Factor);
6718 InstructionCost NumOfResults =
6719 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6720
6721 // About a half of the loads may be folded in shuffles when we have only
6722 // one result. If we have more than one result, or the loads are masked,
6723 // we do not fold loads at all.
6724 unsigned NumOfUnfoldedLoads =
6725 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6726
6727 // Get a number of shuffle operations per result.
6728 unsigned NumOfShufflesPerResult =
6729 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6730
6731 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6732 // When we have more than one destination, we need additional instructions
6733 // to keep sources.
6734 InstructionCost NumOfMoves = 0;
6735 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6736 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6737
6738 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6739 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6740 NumOfMoves;
6741
6742 return Cost;
6743 }
6744
6745 // Store.
6746 assert(Opcode == Instruction::Store &&
6747 "Expected Store Instruction at this point");
6748 // X86InterleavedAccess supports only the following interleaved-access groups.
6749 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6750 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6751 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6752 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6753
6754 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6755 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6756 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6757 {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
6758 };
6759
6760 if (const auto *Entry =
6761 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6762 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6763 // If an entry does not exist, fall back to the default implementation.
6764
6765 // There are no strided stores at the moment, and a store can't be folded
6766 // into a shuffle.
6767 unsigned NumOfSources = Factor; // The number of values to be merged.
6768 InstructionCost ShuffleCost =
6769 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
6770 CostKind, 0, nullptr);
6771 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6772
6773 // The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
6774 // We need additional instructions to keep sources.
6775 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6776 InstructionCost Cost =
6777 MaskCost +
6778 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6779 NumOfMoves;
6780 return Cost;
6781}
6782
6783InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6784 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6785 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6786 bool UseMaskForCond, bool UseMaskForGaps) const {
6787 auto *VecTy = cast<FixedVectorType>(BaseTy);
6788
6789 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6790 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6791 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6792 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6793 return true;
6794 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6795 return ST->hasBWI();
6796 if (EltTy->isBFloatTy())
6797 return ST->hasBF16();
6798 return false;
6799 };
6800 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6801 return getInterleavedMemoryOpCostAVX512(
6802 Opcode, VecTy, Factor, Indices, Alignment,
6803 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6804
6805 if (UseMaskForCond || UseMaskForGaps)
6806 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6807 Alignment, AddressSpace, CostKind,
6808 UseMaskForCond, UseMaskForGaps);
6809
6810 // Get estimation for interleaved load/store operations for SSE-AVX2.
6811 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that would
6812 // allow computing the cost as a generic formula over shuffle costs. We
6813 // therefore use a lookup table instead, filled according to the instruction
6814 // sequences that codegen currently generates.
6815
6816 // VecTy for interleave memop is <VF*Factor x Elt>.
6817 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6818 // VecTy = <12 x i32>.
6819 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6820
6821 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6822 // the VF=2, while v2i128 is an unsupported MVT vector type
6823 // (see MachineValueType.h::getVectorVT()).
6824 if (!LegalVT.isVector())
6825 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6826 Alignment, AddressSpace, CostKind);
6827
6828 unsigned VF = VecTy->getNumElements() / Factor;
6829 Type *ScalarTy = VecTy->getElementType();
6830 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6831 if (!ScalarTy->isIntegerTy())
6832 ScalarTy =
6833 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6834
6835 // Get the cost of all the memory operations.
6836 // FIXME: discount dead loads.
6837 InstructionCost MemOpCosts =
6838 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
6839
6840 auto *VT = FixedVectorType::get(ScalarTy, VF);
6841 EVT ETy = TLI->getValueType(DL, VT);
6842 if (!ETy.isSimple())
6843 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6844 Alignment, AddressSpace, CostKind);
6845
6846 // TODO: Complete for other data-types and strides.
6847 // Each combination of Stride, element bit width and VF results in a different
6848 // sequence; The cost tables are therefore accessed with:
6849 // Factor (stride) and VectorType=VFxiN.
6850 // The Cost accounts only for the shuffle sequence;
6851 // The cost of the loads/stores is accounted for separately.
6852 //
6853 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6854 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6855 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6856 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6857 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6858 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6859
6860 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6861 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6862 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6863
6864 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6865 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6866 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6867
6868 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6869 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6870 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6871 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6872
6873 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6874 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6875 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6876 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6877 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6878
6879 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6880 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6881 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6882 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6883 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6884
6885 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6886 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6887 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6888 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6889 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6890
6891 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6892 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6893 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6894 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6895
6896 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6897 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6898 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6899 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6900 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6901
6902 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6903 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6904 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6905 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6906 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6907
6908 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6909 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6910 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6911 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6912 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6913
6914 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6915 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6916 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6917 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6918
6919 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6920 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6921 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6922 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6923 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6924
6925 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6926 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6927 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6928 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6929 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6930
6931 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6932 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6933 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6934 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6935
6936 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6937 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6938 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6939
6940 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6941 };
6942
6943 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6944 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6945 };
6946
6947 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6948 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6949 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6950
6951 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6952 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6953
6954 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6955 };
6956
6957 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6958 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6959 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6960
6961 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6962 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6963 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6964
6965 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6966 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6967 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6968 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6969
6970 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6971 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6972 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6973 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6974 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6975
6976 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6977 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6978 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6979 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6980 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6981
6982 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6983 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6984 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6985 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6986 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6987
6988 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6989 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6990 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6991 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6992 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6993
6994 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6995 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6996 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6997 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
6998
6999 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
7000 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
7001 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
7002 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
7003 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
7004
7005 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
7006 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
7007 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
7008 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
7009 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
7010
7011 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
7012 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
7013 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
7014 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
7015 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
7016
7017 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
7018 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
7019 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
7020 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
7021
7022 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
7023 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
7024 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
7025 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
7026 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
7027
7028 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
7029 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
7030 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
7031 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
7032 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
7033
7034 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
7035 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
7036 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
7037 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
7038
7039 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
7040 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
7041 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
7042 };
7043
7044 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
7045 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
7046 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
7047 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
7048
7049 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
7050 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
7051
7052 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
7053 };
7054
7055 if (Opcode == Instruction::Load) {
7056 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
7057 MemOpCosts](const CostTblEntry *Entry) {
7058 // NOTE: this is just an approximation!
7059 // It can over/under-estimate the cost!
7060 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
7061 };
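// e.g. (illustrative) with Factor = 3, only NumMembers = 2 of the 3 members
// used, and a table cost of 11 shuffles, the discounted cost is
// MemOpCosts + ceil(2 * 11 / 3) = MemOpCosts + 8.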
7062
7063 if (ST->hasAVX2())
7064 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
7065 ETy.getSimpleVT()))
7066 return GetDiscountedCost(Entry);
7067
7068 if (ST->hasSSSE3())
7069 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7070 ETy.getSimpleVT()))
7071 return GetDiscountedCost(Entry);
7072
7073 if (ST->hasSSE2())
7074 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7075 ETy.getSimpleVT()))
7076 return GetDiscountedCost(Entry);
7077 } else {
7078 assert(Opcode == Instruction::Store &&
7079 "Expected Store Instruction at this point");
7080 assert((!Indices.size() || Indices.size() == Factor) &&
7081 "Interleaved store only supports fully-interleaved groups.");
7082 if (ST->hasAVX2())
7083 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7084 ETy.getSimpleVT()))
7085 return MemOpCosts + Entry->Cost;
7086
7087 if (ST->hasSSE2())
7088 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7089 ETy.getSimpleVT()))
7090 return MemOpCosts + Entry->Cost;
7091 }
7092
7093 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7094 Alignment, AddressSpace, CostKind,
7095 UseMaskForCond, UseMaskForGaps);
7096}
7097
7098InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
7099 StackOffset BaseOffset,
7100 bool HasBaseReg, int64_t Scale,
7101 unsigned AddrSpace) const {
7102 // Scaling factors are not free at all.
7103 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7104 // will take 2 allocations in the out of order engine instead of 1
7105 // for plain addressing mode, i.e. inst (reg1).
7106 // E.g.,
7107 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7108 // Requires two allocations (one for the load, one for the computation)
7109 // whereas:
7110 // vaddps (%rsi), %ymm0, %ymm1
7111 // Requires just 1 allocation, i.e., freeing allocations for other operations
7112 // and having fewer micro-operations to execute.
7113 //
7114 // For some X86 architectures, this is even worse because for instance for
7115 // stores, the complex addressing mode forces the instruction to use the
7116 // "load" ports instead of the dedicated "store" port.
7117 // E.g., on Haswell:
7118 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7119 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7120 TargetLoweringBase::AddrMode AM;
7121 AM.BaseGV = BaseGV;
7122 AM.BaseOffs = BaseOffset.getFixed();
7123 AM.HasBaseReg = HasBaseReg;
7124 AM.Scale = Scale;
7125 AM.ScalableOffset = BaseOffset.getScalable();
7126 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7127 // Scale represents reg2 * scale, thus account for 1
7128 // as soon as we use a second register.
7129 return AM.Scale != 0;
7130 return InstructionCost::getInvalid();
7131}
7132
7133InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
7134 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7135 return 14;
7136}
7137
7138bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
7139 unsigned Bits = Ty->getScalarSizeInBits();
7140
7141 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7142 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7143 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7144 return false;
7145
7146 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7147 // shifts just as cheap as scalar ones.
7148 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7149 return false;
7150
7151 // AVX512BW has shifts such as vpsllvw.
7152 if (ST->hasBWI() && Bits == 16)
7153 return false;
7154
7155 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7156 // fully general vector.
7157 return true;
7158}
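// e.g. on an SSE2-only target a uniform "shl <8 x i16> %v, %splat_amt" can
// be selected as a single psllw with the shift count held in an xmm
// register, whereas a genuinely per-lane variable shift would have to be
// scalarized.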
7159
7160unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7161 Type *ScalarValTy) const {
7162 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7163 return 4;
7164 }
7165 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7166}
7167
7168bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
7169 SmallVectorImpl<Use *> &Ops) const {
7170 using namespace llvm::PatternMatch;
7171
7172 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7173 if (!VTy)
7174 return false;
7175
7176 if (I->getOpcode() == Instruction::Mul &&
7177 VTy->getElementType()->isIntegerTy(64)) {
7178 for (auto &Op : I->operands()) {
7179 // Make sure we are not already sinking this operand
7180 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7181 continue;
7182
7183 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7184 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7185 if (ST->hasSSE41() &&
7186 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7187 m_SpecificInt(32)))) {
7188 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7189 Ops.push_back(&Op);
7190 } else if (ST->hasSSE2() &&
7191 match(Op.get(),
7192 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7193 Ops.push_back(&Op);
7194 }
7195 }
7196
7197 return !Ops.empty();
7198 }
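// e.g. (illustrative IR) in
//   %s = shl <2 x i64> %x, splat (i64 32)
//   %a = ashr <2 x i64> %s, splat (i64 32)
//   %m = mul <2 x i64> %a, %y
// sinking %s and %a next to the multiply lets SDAG see the sext_inreg from
// vXi32 and select PMULDQ.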
7199
7200 // A uniform shift amount in a vector shift or funnel shift may be much
7201 // cheaper than a generic variable vector shift, so make that pattern visible
7202 // to SDAG by sinking the shuffle instruction next to the shift.
7203 int ShiftAmountOpNum = -1;
7204 if (I->isShift())
7205 ShiftAmountOpNum = 1;
7206 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7207 if (II->getIntrinsicID() == Intrinsic::fshl ||
7208 II->getIntrinsicID() == Intrinsic::fshr)
7209 ShiftAmountOpNum = 2;
7210 }
7211
7212 if (ShiftAmountOpNum == -1)
7213 return false;
7214
7215 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7216 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7217 isVectorShiftByScalarCheap(I->getType())) {
7218 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7219 return true;
7220 }
7221
7222 return false;
7223}
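// e.g. (illustrative IR) given
//   %amt = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> zeroinitializer
//   %r = shl <4 x i32> %v, %amt
// sinking the splat next to the shift lets ISel use the shift-by-scalar form
// (pslld with the count in an xmm register) instead of vpsllvd.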
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Common GEP
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST)
This file a TargetTransformInfoImplBase conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:1012
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1670
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1488
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1041
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:482
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:142
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
Definition: BasicTTIImpl.h:558
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:888
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
Definition: BasicTTIImpl.h:476
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
Definition: BasicTTIImpl.h:459
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:997
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:681
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:708
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:702
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:701
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:686
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:689
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:703
@ ICMP_EQ
equal
Definition: InstrTypes.h:699
@ ICMP_NE
not equal
Definition: InstrTypes.h:700
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:706
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:704
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:688
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getPointerSizeInBits(unsigned AS=0) const
The size in bits of the pointer representation in a given address space.
Definition: DataLayout.h:390
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:842
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:674
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:468
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:846
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:323
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:949
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
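The MVT predicates above are how legalized types are classified before a table lookup; a small generic sketch:

  #include "llvm/CodeGenTypes/MachineValueType.h"
  using namespace llvm;

  // True for 128-bit integer vectors such as MVT::v4i32 or MVT::v8i16.
  bool is128BitIntVector(MVT VT) {
    return VT.isVector() && VT.is128BitVector() &&
           VT.getVectorElementType().isInteger();
  }

For example, is128BitIntVector(MVT::v4i32) is true, while MVT::v4f32 (float elements) and MVT::v8i32 (256 bits) are rejected.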
Class to represent pointers.
Definition: DerivedTypes.h:700
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:740
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
bool empty() const
Definition: SmallVector.h:82
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:574
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:34
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:44
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:43
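StackOffset keeps the fixed and vscale-scaled parts of an offset separate; a two-line sketch:

  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  void stackOffsets() {
    StackOffset FixedOff = StackOffset::getFixed(16);  // 16 bytes
    StackOffset ScalOff = StackOffset::getScalable(4); // 4 * vscale bytes
    (void)FixedOff; (void)ScalOff;
  }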
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
MVT getSimpleValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the MVT corresponding to this LLVM type. See getValueType.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
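These TargetLoweringBase hooks bridge IR opcodes and types to the ISD/MVT world used by the cost tables; a hedged sketch of a legality query (TLI and DL assumed to exist):

  #include "llvm/CodeGen/TargetLowering.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Is a plain integer add natively selectable for this type?
  bool isLegalAddFor(const TargetLoweringBase &TLI, const DataLayout &DL,
                     Type *Ty) {
    int ISDOp = TLI.InstructionOpcodeToISD(Instruction::Add); // -> ISD::ADD
    EVT VT = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
    return VT.isSimple() && TLI.isOperationLegal(ISDOp, VT);
  }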
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
virtual InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of an instruction.
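A sketch of how a client selects one of these cost kinds when querying TTI (the opcode and type here are arbitrary):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Throughput is what the vectorizers ask for; size-and-latency feeds
  // unrolling/inlining thresholds.
  InstructionCost mulThroughput(const TargetTransformInfo &TTI, Type *Ty) {
    return TTI.getArithmeticInstrCost(Instruction::Mul, Ty,
                                      TargetTransformInfo::TCK_RecipThroughput);
  }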
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
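A sketch of querying one of these shuffle kinds through getShuffleCost, using the override signature listed further below for X86TTIImpl (the TargetTransformInfo wrapper mirrors it):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  // Cost of broadcasting lane 0 of a <4 x i32> to all lanes.
  InstructionCost broadcastCost(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
    auto *VTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
    return TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VTy, VTy,
                              /*Mask=*/{0, 0, 0, 0},
                              TargetTransformInfo::TCK_RecipThroughput,
                              /*Index=*/0, /*SubTp=*/nullptr);
  }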
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:349
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:273
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:246
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
LLVM_ABI unsigned getIntegerBitWidth() const
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
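A short sketch of the Type predicates/queries above, of the kind the cost model uses to dispatch:

  #include "llvm/IR/Type.h"
  using namespace llvm;

  // Width of the scalar element: 32 for <4 x float>, 64 for double, etc.
  unsigned scalarBits(Type *Ty) {
    Type *ScalarTy = Ty->getScalarType();
    return ScalarTy->isFloatingPointTy() || ScalarTy->isIntegerTy()
               ? ScalarTy->getScalarSizeInBits()
               : 0;
  }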
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
Base class of all SIMD vector types.
Definition: DerivedTypes.h:430
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements in the input type.
Definition: DerivedTypes.h:490
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:695
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same element type.
Definition: DerivedTypes.h:553
Type * getElementType() const
Definition: DerivedTypes.h:463
bool hasSSE1() const
Definition: X86Subtarget.h:189
bool hasSSE42() const
Definition: X86Subtarget.h:194
bool useAVX512Regs() const
Definition: X86Subtarget.h:249
bool hasSSE3() const
Definition: X86Subtarget.h:191
bool hasAVX512() const
Definition: X86Subtarget.h:197
bool hasSSE41() const
Definition: X86Subtarget.h:193
bool hasSSE2() const
Definition: X86Subtarget.h:190
bool hasSSSE3() const
Definition: X86Subtarget.h:192
bool hasAVX() const
Definition: X86Subtarget.h:195
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:221
bool hasAVX2() const
Definition: X86Subtarget.h:196
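The feature predicates above are typically tested newest-first so the most capable instruction set wins; a schematic of that laddering (return values invented, not real costs; X86Subtarget.h is the target-internal header):

  #include "X86Subtarget.h" // lib/Target/X86, not a public header
  using namespace llvm;

  unsigned pickTierCost(const X86Subtarget *ST) {
    if (ST->hasAVX512()) return 1; // illustrative values only
    if (ST->hasAVX2())   return 2;
    if (ST->hasSSE42())  return 3;
    if (ST->hasSSE2())   return 4;
    return 8;                      // scalar fallback
  }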
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
bool isLegalNTLoad(Type *DataType, Align Alignment) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace) const override
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
unsigned getRegisterClassForType(bool Vector, Type *Ty) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalNTStore(Type *DataType, Align Alignment) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isVectorShiftByScalarCheap(Type *Ty) const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Type) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target, for a load/store of the specified type.
unsigned getAtomicMemIntrinsicMaxElementSize() const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) const override
InstructionCost getBranchMispredictPenalty() const override
bool isExpensiveToSpeculativelyExecute(const Instruction *I) const override
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) const
bool enableInterleavedAccessVectorization() const override
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
Calculate the cost of a Gather / Scatter operation.
unsigned getNumberOfRegisters(unsigned ClassID) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool supportsEfficientVectorElementLoadStore() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
InstructionCost getIntImmCost(int64_t) const
Calculate the cost of materializing a 64-bit value.
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool canMacroFuseCmp() const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
bool prefersVectorizedAddressing() const override
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:203
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:169
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:3009
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:45
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:862
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:773
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width (W).
Definition: ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:343
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:563
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definitions.
Definition: ISDOpcodes.h:1059
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:351
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:1081
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W).
Definition: ISDOpcodes.h:360
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
apint_match m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
Definition: PatternMatch.h:305
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
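A minimal sketch of composing the matchers listed above (not code from this file):

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Recognize a single-use (X << C) where C is a constant splat,
  // allowing poison lanes in the splat.
  bool isOneUseShlByConst(Value *V) {
    Value *X;
    const APInt *C;
    return match(V, m_OneUse(m_Shl(m_Value(X), m_APIntAllowPoison(C))));
  }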
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
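A hedged sketch of the CostTableLookup idiom; the table entries are invented for illustration and are not the real X86 numbers:

  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/Support/InstructionCost.h"
  using namespace llvm;

  static const CostTblEntry ExampleCostTbl[] = {
      {ISD::ADD, MVT::v4i32, 1}, // hypothetical costs
      {ISD::SHL, MVT::v4i32, 2},
  };

  InstructionCost exampleLookup(int ISDOp, MVT VT) {
    if (const auto *Entry = CostTableLookup(ExampleCostTbl, ISDOp, VT))
      return Entry->Cost;
    return InstructionCost::getInvalid(); // no entry: defer to the base impl
  }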
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), where A is the 0-based index of the item and B, C, ... are the corresponding values from each input range.
Definition: STLExtras.h:2491
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:551
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
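A small sketch of the computeKnownBits interface listed above, e.g. to prove divisibility:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  // True if V is provably a multiple of 4.
  bool isKnownMultipleOf4(const Value *V, const DataLayout &DL) {
    KnownBits Known(DL.getTypeSizeInBits(V->getType()).getFixedValue());
    computeKnownBits(V, Known, DL);
    return Known.countMinTrailingZeros() >= 2;
  }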
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:399
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1854
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1980
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
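ConvertCostTableLookup is the two-type variant keyed on (ISD, Dst, Src); a sketch with an invented entry:

  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/Support/InstructionCost.h"
  using namespace llvm;

  static const TypeConversionCostTblEntry ExampleConvTbl[] = {
      {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1}, // hypothetical cost
  };

  InstructionCost sextV4I16ToV4I32Cost() {
    if (const auto *Entry = ConvertCostTableLookup(
            ExampleConvTbl, ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16))
      return Entry->Cost;
    return InstructionCost::getInvalid();
  }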
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
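EVT wraps both simple (MVT-backed) and extended types; a sketch of the usual narrowing check:

  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;

  // Only simple EVTs can be used to key MVT-based cost tables.
  bool canKeyCostTable(EVT VT) {
    return VT.isSimple() && VT.getSimpleVT().isVector();
  }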
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:235
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition: CostTable.h:55