//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than to a specific CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem as that was the first CPU to support that feature level
/// and thus most likely has the worst-case cost, although we may discard an
/// outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem / Silvermont
///   AVX     - Sandy Bridge / Jaguar / Bulldozer
///   AVX2    - Haswell / Ryzen
///   AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency)
///                    divss     sqrtss     rsqrtss
///   AMD K7           11-16     19         3
///   Piledriver       9-24      13-15      5
///   Jaguar           14        16         2
///   Pentium II,III   18        30         2
///   Nehalem          7-14      7-18       3
///   Haswell          10-13     11         5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
//===----------------------------------------------------------------------===//
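//
// For example, the AVX512 cost-table entry used below for f32 division,
//   { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake
// stores one value per cost kind in the order {RecipThroughput, Latency,
// CodeSize, SizeAndLatency}: a reciprocal throughput of 3, a latency of 11
// cycles, a single instruction, and a single micro-op.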

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  std::optional<unsigned>
  operator[](TargetTransformInfo::TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TargetTransformInfo::TCK_RecipThroughput:
      Cost = RecipThroughputCost;
      break;
    case TargetTransformInfo::TCK_Latency:
      Cost = LatencyCost;
      break;
    case TargetTransformInfo::TCK_CodeSize:
      Cost = CodeSizeCost;
      break;
    case TargetTransformInfo::TCK_SizeAndLatency:
      Cost = SizeAndLatencyCost;
      break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;

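// A minimal sketch of how the tables below are consumed (Tbl is a
// hypothetical stand-in for any of the per-ISA tables that follow): look the
// entry up by ISD opcode and legalized MVT, then index the per-kind costs by
// the requested cost kind.
//
//   static const CostKindTblEntry Tbl[] = {
//     { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld
//   };
//   if (const auto *Entry = CostTableLookup(Tbl, ISD, LT.second))
//     if (auto KindCost = Entry->Cost[CostKind])
//       return LT.first * *KindCost; // scaled by the legalization count
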
TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  //   instructions is inefficient. Once the problem is fixed, we should
  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

std::optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KiB
  case TargetTransformInfo::CacheLevel::L2:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KiB
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    [[fallthrough]];
  case TargetTransformInfo::CacheLevel::L2:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}


unsigned X86TTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  return Vector                          ? VectorClass
         : Ty && Ty->isFloatingPointTy() ? ScalarFPClass
                                         : GPRClass;
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  if (ClassID == VectorClass && !ST->hasSSE1())
    return 0;

  if (!ST->is64Bit())
    return 8;

  if ((ClassID == GPRClass && ST->hasEGPR()) ||
      (ClassID != GPRClass && ST->hasAVX512()))
    return 32;

  return 16;
}

bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's
  // profitable.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1))
    return false;
  auto *ScalarTy = Ty->getScalarType();
  switch (cast<IntegerType>(ScalarTy)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}
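
// For example, on an AVX-512 subtarget compiled with
// -mprefer-vector-width=256, PreferVectorWidth is 256, so fixed-width vector
// registers are reported as 256 bits wide even though 512-bit ZMM registers
// exist.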

unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let regular unrolling handle the loop instead, which saves the overflow
  // check and memory check cost.
  if (VF.isScalar())
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // vXi8 multiplications are always promoted to vXi16.
  // Sub-128-bit types can be extended/packed more efficiently.
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info,
                                  Op2Info);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      (LT.second.getScalarType() == MVT::i32 ||
       LT.second.getScalarType() == MVT::i64)) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
    bool SignedMode = Op1Signed || Op2Signed;

    // If both vXi32 are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
        LT.second.getScalarType() == MVT::i32) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
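
    // E.g. a v4i32 multiply whose operands are both sign-extended from v4i16
    // is costed above as a v8i16 multiply, since PMADDWD does the same work
    // on twice as many i16 lanes.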

    // Check if the vXi32 operands can be shrunk into a smaller datatype.
    // This should match the codegen from reduceVMULWidth.
    // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
    if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    // If both vXi64 are representable as (unsigned) i32, then we can perform
    // the multiply with a single PMULUDQ instruction.
    // TODO: Add (SSE41+) PMULDQ handling for signed extensions.
    if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
      ISD = X86ISD::PMULUDQ;
  }

  // Vector multiply by pow2 will be simplified to shifts.
  // Vector multiply by -pow2 will be simplified to shifts/negates.
  if (ISD == ISD::MUL && Op2Info.isConstant() &&
      (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
    InstructionCost Cost =
        getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
                               Op1Info.getNoProps(), Op2Info.getNoProps());
    if (Op2Info.isNegatedPowerOf2())
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
    return Cost;
  }
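
  // E.g. a multiply by -8 is costed as a shift (X << 3) plus a subtract
  // (0 - X), matching the shift/negate expansion used by codegen.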

  // On X86, vector signed division by a constant power-of-two is normally
  // expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
                                   Op1Info.getNoProps(), Op2Info.getNoProps());

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
                                     Op1Info.getNoProps(),
                                     Op2Info.getNoProps());
    }

    return Cost;
  }
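
  // Concretely, for vXi32 X /s 8 the expansion is:
  //   T = X >>s 31   (SRA: broadcast the sign bit)
  //   T = T >>u 29   (SRL: keep the low 3 bits as a rounding bias)
  //   X = X + T      (ADD)
  //   X = X >>s 3    (SRA: the actual division)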

  // Vector unsigned division/remainder will be simplified to shifts/masks.
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      Op2Info.isConstant() && Op2Info.isPowerOf2()) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
                                  Op1Info.getNoProps(), Op2Info.getNoProps());
  }
379
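  // E.g. X u/ 8 becomes X >>u 3 and X u% 8 becomes X & 7.
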
  static const CostKindTblEntry GFNIUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
    if (const auto *Entry =
            CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.

    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq

    { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
    { ISD::SHL, MVT::v16i16, { 2, 2, 1, 2 } }, // psllw
    { ISD::SRL, MVT::v16i16, { 2, 2, 1, 2 } }, // psrlw
    { ISD::SRA, MVT::v16i16, { 2, 2, 1, 2 } }, // psraw

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 6, 4, 5 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 6, 4, 5 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 6, 4, 5 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
    { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.

    { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX512())
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.

    { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
  };

  if (Op2Info.isConstant() && ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE41ConstCostTable[] = {
    { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
    { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE41())
    if (const auto *Entry =
            CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence

    { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence

    { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
  };

  if (Op2Info.isConstant() && ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 10, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 12, 8, 12 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 5, 10, 10, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, { 4, 8, 7, 10 } }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, { 5, 10, 10, 15 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
    { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
    { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
  };

  if (ST->hasBWI() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512UniformCostTable[] = {
    { ISD::SHL, MVT::v32i16, { 5, 10, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v32i16, { 5, 10, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v32i16, { 5, 10, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
    { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
  };

  if (ST->hasAVX512() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 4, 5, 9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, { 6, 9, 11, 16 } }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
    { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
    { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
    { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
    { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
    { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
    { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad

    { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
    { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
    { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
  };

  if (ST->hasAVX2() && Op2Info.isUniform())
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVXUniformCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 6, 6, 9, 13 } }, // psrlw, pand, pxor, psubb.
    { ISD::SHL, MVT::v32i8, { 7, 8, 11, 14 } }, // psllw + pand + split.
    { ISD::SRL, MVT::v32i8, { 7, 9, 10, 14 } }, // psrlw + pand + split.
    { ISD::SRA, MVT::v32i8, { 10, 11, 16, 21 } }, // psrlw, pand, pxor, psubb + split.

    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
    { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
    { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
    { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.

    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
    { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
    { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
    { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
    { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.

    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
    { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
    { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
    { ISD::SRA, MVT::v4i64, { 6, 7, 10, 13 } }, // 2 x (2 x psrad + shuffle) + split.
  };

  // XOP has faster vXi8 shifts.
  if (ST->hasAVX() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(AVXUniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, { 11, 15, 9, 13 } }, // pcmpgtb sequence.

    { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
    { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
    { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.

    { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
    { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
    { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.

    { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
    { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
    { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
  };

  if (ST->hasSSE2() && Op2Info.isUniform() &&
      (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
    { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, { 4, 23, 11, 16 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, { 4, 30, 12, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, { 6, 13, 24, 30 } }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, { 6, 19, 13, 15 } }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, { 7, 27, 15, 18 } }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, { 15, 15, 30, 30 } }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
    { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
    { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw

    { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw

    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw

    { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc
    { ISD::MUL, MVT::v32i8, { 3, 10, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v64i8, { 3, 11, 7, 10 } }, // pmaddubsw
    { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v64i8, { 15, 19, 27, 33 } }, // vpblendv+split sequence.
    { ISD::SRL, MVT::v64i8, { 15, 19, 30, 36 } }, // vpblendv+split sequence.
    { ISD::SRA, MVT::v64i8, { 37, 37, 51, 63 } }, // vpblendv+split sequence.

    { ISD::SHL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { ISD::SRA, MVT::v32i16, { 11, 16, 11, 15 } }, // 2*extend/vpsravd/pack sequence.

    { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },

    { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
    { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
    { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },

    { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
    { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split

    { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
    { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },

    { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/

    { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2 even though we declare them as
    // custom so that we can detect the cases where the shift amount is a
    // scalar splat.
    { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 && Op2Info.isConstant())
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info.getNoProps(), Op2Info.getNoProps());

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128-bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
    { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
    { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
    { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
    // 256-bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
    { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
    { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
    { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;
  }

  if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
    MVT VT = LT.second;
    // Vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }
1081
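  // E.g. shl <8 x i16> %x, <i16 1, i16 2, ...> is costed as a multiply by
  // <i16 2, i16 4, ...>, since codegen lowers such shifts to pmullw.
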
  static const CostKindTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
    { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry SLMCostTable[] = {
    { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
    { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
    { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
    { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
    { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
    { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
    { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
    { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
    { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
    { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
    { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long multiplies (3),
    // shifts (3) and adds (2).
    // slm muldq throughput is 2 and addq throughput is 4,
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 2X4 (addq throughput) = 17
    { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
    // slm addq/subq throughput is 4
    { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
    { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
  };

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8, { 6, 21, 11, 16 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 6, 23, 11, 22 } }, // vpblendvb sequence.
    { ISD::SHL, MVT::v8i16, { 5, 18, 5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SHL, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL, MVT::v16i8, { 6, 27, 12, 18 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 8, 30, 12, 24 } }, // vpblendvb sequence.
    { ISD::SRL, MVT::v8i16, { 5, 11, 5, 10 } }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8, { 17, 17, 24, 30 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 18, 20, 24, 43 } }, // vpblendvb sequence.
    { ISD::SRA, MVT::v8i16, { 5, 11, 5, 10 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, { 8, 10, 10, 14 } }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
    { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
    { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
    { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
    { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
    { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
    { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
    { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq

    { ISD::MUL, MVT::v16i8, { 5, 18, 6, 12 } }, // extend/pmullw/pack
    { ISD::MUL, MVT::v32i8, { 4, 8, 8, 16 } }, // pmaddubsw
    { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
    { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 6, 10, 8, 13 } }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add

    { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },

    { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
    { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps

    { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
    { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
    { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
    { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
    { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
    { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps

    { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
    { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
    { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
    { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
    { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
    { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps

    { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
    { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
    { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
    { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps

    { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
    { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
    { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
    { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
    { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
    { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

  static const CostKindTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split
    { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
    { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
    { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
    { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
    { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },

    { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split

    { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
    { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps

    { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
    { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps

    { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
    { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps

    { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
    { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
    { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
    { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
    { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
    { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
    { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
    { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
    { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
    { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq

    { ISD::SHL, MVT::v16i8, { 10, 21, 11, 17 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v32i8, { 22, 22, 27, 40 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16, { 6, 9, 11, 11 } }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, { 13, 16, 24, 25 } }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32, { 9, 11, 12, 17 } }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SHL, MVT::v4i64, { 6, 7, 11, 15 } }, // Shift each lane + blend + split.

    { ISD::SRL, MVT::v16i8, { 11, 27, 12, 18 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v32i8, { 23, 23, 30, 43 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v8i16, { 13, 16, 14, 22 } }, // pblendvb sequence.
    { ISD::SRL, MVT::v16i16, { 28, 30, 31, 48 } }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32, { 6, 7, 12, 16 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32, { 14, 14, 26, 34 } }, // Shift each lane + blend + split.
    { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
    { ISD::SRL, MVT::v4i64, { 6, 7, 11, 15 } }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v16i8, { 21, 22, 24, 36 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v32i8, { 44, 45, 51, 76 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v8i16, { 13, 16, 14, 22 } }, // pblendvb sequence.
    { ISD::SRA, MVT::v16i16, { 28, 30, 31, 48 } }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32, { 6, 7, 12, 16 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32, { 14, 14, 26, 34 } }, // Shift each lane + blend + split.
    { ISD::SRA, MVT::v2i64, { 5, 6, 10, 14 } }, // Shift each lane + blend.
    { ISD::SRA, MVT::v4i64, { 12, 12, 22, 30 } }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * *KindCost;

1301 static const CostKindTblEntry SSE42CostTable[] = {
1302 { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1303 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1304 { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1305 { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1306
1307 { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1308 { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1309 { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1310 { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
1311
1312 { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1313 { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1314 { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1315 { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
1316
1317 { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1318 { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
1319 { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1320 { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
1321
1322 { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
1323 };
1324
1325 if (ST->hasSSE42())
1326 if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
1327 if (auto KindCost = Entry->Cost[CostKind])
1328 return LT.first * *KindCost;
1329
1330 static const CostKindTblEntry SSE41CostTable[] = {
1331 { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
1332 { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
1333 { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
1334
1335 { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
1336 { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1337 { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1338 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1339
1340 { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
1341 { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
1342 { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
1343 { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
1344
1345 { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
1346 };
1347
1348 if (ST->hasSSE41())
1349 if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
1350 if (auto KindCost = Entry->Cost[CostKind])
1351 return LT.first * *KindCost;
1352
1353 static const CostKindTblEntry SSSE3CostTable[] = {
1354 { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
1355 };
1356
1357 if (ST->hasSSSE3())
1358 if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
1359 if (auto KindCost = Entry->Cost[CostKind])
1360 return LT.first * *KindCost;
1361
1362 static const CostKindTblEntry SSE2CostTable[] = {
1363 // We don't correctly identify costs of casts because they are marked as
1364 // custom.
1365 { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
1366 { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
1367 { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
1368 { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1369
1370 { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
1371 { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1372 { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1373 { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
1374
1375 { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
1376 { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
1377 { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
1378 { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
1379
1380 { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
1381 { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
1382 { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
1383 { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
1384
1385 { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
1386 { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
1387 { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
1388 { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
1389
1390 { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
1391 { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
1392 { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
1393 { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
1394
1395 { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
1396 { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
1397
1398 { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
1399 { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1400 { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1401 { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
1402
1403 { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
1404
1405 { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
1406 { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
1407 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
1408 { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
1409
1410 { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1411 { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1412 { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1413 { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
1414
1415 { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1416 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1417 { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1418
1419 { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1420 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1421 { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
1422
1423 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1424 { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
1425 };
1426
1427 if (ST->hasSSE2())
1428 if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
1429 if (auto KindCost = Entry->Cost[CostKind])
1430 return LT.first * *KindCost;
1431
1432 static const CostKindTblEntry SSE1CostTable[] = {
1433 { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
1434 { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
1435
1436 { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1437 { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
1438
1439 { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1440 { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1441
1442 { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1443 { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
1444
1445 { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1446 { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
1447 };
1448
1449 if (ST->hasSSE1())
1450 if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
1451 if (auto KindCost = Entry->Cost[CostKind])
1452 return LT.first * *KindCost;
1453
1454 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
1455 { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1456 { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
1457 { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
1458 };
1459
1460 if (ST->is64Bit())
1461 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
1462 if (auto KindCost = Entry->Cost[CostKind])
1463 return LT.first * *KindCost;
1464
1465 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
1466 { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1467 { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1468 { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1469
1470 { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
1471 { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
1472 { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
1473
1474 { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
1475 { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
1476 { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
1477
1478 { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
1479 { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1480 { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
1481 { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
1482 { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
1483 };
1484
1485 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
1486 if (auto KindCost = Entry->Cost[CostKind])
1487 return LT.first * *KindCost;
1488
1489  // It is not a good idea to vectorize division. We have to scalarize it and
1490  // in the process we will often end up having to spill regular
1491  // registers. The overhead of division is going to dominate most kernels
1492  // anyway, so try hard to prevent vectorization of division - it is
1493  // generally a bad idea. Assume somewhat arbitrarily that we have to be able
1494  // to hide "20 cycles" for each lane.
1495 if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
1496 (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
1497 ISD == ISD::UREM)) {
1498 InstructionCost ScalarCost =
1499 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
1500 Op1Info.getNoProps(), Op2Info.getNoProps());
1501 return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
1502 }
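  // For example: a <4 x i32> sdiv that legalizes to a single v4i32 register
  // (LT.first == 1) with a scalar sdiv cost of S is priced at 20 * 1 * 4 * S,
  // which strongly discourages the vectorizers from picking the vector form.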
1503
1504 // Handle some basic single instruction code size cases.
1505 if (CostKind == TTI::TCK_CodeSize) {
1506 switch (ISD) {
1507 case ISD::FADD:
1508 case ISD::FSUB:
1509 case ISD::FMUL:
1510 case ISD::FDIV:
1511 case ISD::FNEG:
1512 case ISD::AND:
1513 case ISD::OR:
1514 case ISD::XOR:
1515 return LT.first;
1516 break;
1517 }
1518 }
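  // For example: a scalar i32 'and' has no dedicated table entry above, so the
  // switch prices it at LT.first == 1, i.e. a single instruction.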
1519
1520 // Fallback to the default implementation.
1521 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1522 Args, CxtI);
1523}
1524
1525InstructionCost
1526X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
1527                            unsigned Opcode1, const SmallBitVector &OpcodeMask,
1528                            TTI::TargetCostKind CostKind) const {
1529 if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
1530 return TTI::TCC_Basic;
1531  return InstructionCost::getInvalid();
1532}
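
// A usage sketch (illustrative; names like TTI and Ctx are assumed caller
// context): the SLP vectorizer can ask whether an alternating fsub/fadd
// pattern maps to a single (v)addsubps. A set bit in OpcodeMask selects
// Opcode1 for that lane, so lanes {0,2} below are fsub and {1,3} are fadd:
//
//   SmallBitVector OpcodeMask(4);   // 4 lanes, Opcode0 (fadd) by default
//   OpcodeMask.set(0);
//   OpcodeMask.set(2);
//   InstructionCost C = TTI.getAltInstrCost(
//       FixedVectorType::get(Type::getFloatTy(Ctx), 4), Instruction::FAdd,
//       Instruction::FSub, OpcodeMask, TTI::TCK_RecipThroughput);
//   // C == TTI::TCC_Basic when ADDSUB is legal (SSE3 and later).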
1533
1534InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1535                                           VectorType *DstTy, VectorType *SrcTy,
1536                                           ArrayRef<int> Mask,
1537                                           TTI::TargetCostKind CostKind,
1538                                           int Index, VectorType *SubTp,
1539                                           ArrayRef<const Value *> Args,
1540                                           const Instruction *CxtI) const {
1541 assert((Mask.empty() || DstTy->isScalableTy() ||
1542 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1543 "Expected the Mask to match the return size if given");
1544 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1545 "Expected the same scalar types");
1546
1547 // 64-bit packed float vectors (v2f32) are widened to type v4f32.
1548 // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
1549 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1550
1551 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1552
1553  // If all args are constant then this will be constant folded away.
1554 if (!Args.empty() &&
1555 all_of(Args, [](const Value *Arg) { return isa<Constant>(Arg); }))
1556 return TTI::TCC_Free;
1557
1558 // Recognize a basic concat_vector shuffle.
1559 if (Kind == TTI::SK_PermuteTwoSrc &&
1560 Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
1561 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
1562    return getShuffleCost(TTI::SK_InsertSubvector,
1563                          VectorType::getDoubleElementsVectorType(SrcTy),
1564                          VectorType::getDoubleElementsVectorType(SrcTy), {},
1565                          CostKind, Mask.size() / 2, SrcTy);
1566
1567 // Treat Transpose as 2-op shuffles - there's no difference in lowering.
1568 if (Kind == TTI::SK_Transpose)
1569 Kind = TTI::SK_PermuteTwoSrc;
1570
1571 if (Kind == TTI::SK_Broadcast) {
1572 // For Broadcasts we are splatting the first element from the first input
1573 // register, so only need to reference that input and all the output
1574 // registers are the same.
1575 LT.first = 1;
1576
1577 // If we're broadcasting a load then AVX/AVX2 can do this for free.
1578 using namespace PatternMatch;
1579 if (!Args.empty() && match(Args[0], m_OneUse(m_Load(m_Value()))) &&
1580 (ST->hasAVX2() ||
1581 (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32)))
1582 return TTI::TCC_Free;
1583 }
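  // For example: under the broadcast check above, a splat of a one-use float
  // load is free on AVX/AVX2 because it folds into a single memory-operand
  // vbroadcastss.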
1584
1585  // Attempt to detect a cheaper in-lane shuffle, avoiding a 128-bit
1586  // subvector permutation.
1587 // Attempt to detect a shuffle mask with a single defined element.
1588 bool IsInLaneShuffle = false;
1589 bool IsSingleElementMask = false;
1590 if (SrcTy->getPrimitiveSizeInBits() > 0 &&
1591 (SrcTy->getPrimitiveSizeInBits() % 128) == 0 &&
1592 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
1593 Mask.size() == SrcTy->getElementCount().getKnownMinValue()) {
1594 unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
1595 unsigned NumEltsPerLane = Mask.size() / NumLanes;
1596 if ((Mask.size() % NumLanes) == 0) {
1597 IsInLaneShuffle = all_of(enumerate(Mask), [&](const auto &P) {
1598 return P.value() == PoisonMaskElem ||
1599 ((P.value() % Mask.size()) / NumEltsPerLane) ==
1600 (P.index() / NumEltsPerLane);
1601 });
1602 IsSingleElementMask =
1603 (Mask.size() - 1) == static_cast<unsigned>(count_if(Mask, [](int M) {
1604 return M == PoisonMaskElem;
1605 }));
1606 }
1607 }
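  // For example: for v8f32 on AVX (two 128-bit lanes of four elements), the
  // mask <0,1,2,3,12,13,14,15> is in-lane (every result element stays within
  // its source 128-bit lane), so no cross-lane vperm2f128 is required.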
1608
1609 // Treat <X x bfloat> shuffles as <X x half>.
1610 if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16)
1611 LT.second = LT.second.changeVectorElementType(MVT::f16);
1612
1613 // Subvector extractions are free if they start at the beginning of a
1614 // vector and cheap if the subvectors are aligned.
1615 if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
1616 int NumElts = LT.second.getVectorNumElements();
1617 if ((Index % NumElts) == 0)
1618 return TTI::TCC_Free;
1619 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1620 if (SubLT.second.isVector()) {
1621 int NumSubElts = SubLT.second.getVectorNumElements();
1622 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1623 return SubLT.first;
1624 // Handle some cases for widening legalization. For now we only handle
1625 // cases where the original subvector was naturally aligned and evenly
1626 // fit in its legalized subvector type.
1627 // FIXME: Remove some of the alignment restrictions.
1628 // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
1629 // vectors.
1630 int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
1631 if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
1632 (NumSubElts % OrigSubElts) == 0 &&
1633 LT.second.getVectorElementType() ==
1634 SubLT.second.getVectorElementType() &&
1635 LT.second.getVectorElementType().getSizeInBits() ==
1636 SrcTy->getElementType()->getPrimitiveSizeInBits()) {
1637 assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
1638 "Unexpected number of elements!");
1639 auto *VecTy = FixedVectorType::get(SrcTy->getElementType(),
1640 LT.second.getVectorNumElements());
1641 auto *SubTy = FixedVectorType::get(SrcTy->getElementType(),
1642 SubLT.second.getVectorNumElements());
1643 int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
1644 InstructionCost ExtractCost =
1645          getShuffleCost(TTI::SK_ExtractSubvector, VecTy, VecTy, {}, CostKind,
1646                         ExtractIndex, SubTy);
1647
1648        // If the original size is 32 bits or more, we can use pshufd. Otherwise,
1649        // if we have SSSE3, we can use pshufb.
1650 if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
1651 return ExtractCost + 1; // pshufd or pshufb
1652
1653 assert(SubTp->getPrimitiveSizeInBits() == 16 &&
1654 "Unexpected vector size");
1655
1656 return ExtractCost + 2; // worst case pshufhw + pshufd
1657 }
1658 }
1659 // If the extract subvector is not optimal, treat it as single op shuffle.
1660    Kind = TTI::SK_PermuteSingleSrc;
1661  }
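  // For example: on AVX, extracting the low v4f32 half of a v8f32
  // (Index == 0) is free (just a register rename), while extracting the upper
  // half (Index == 4) is aligned and typically a single vextractf128.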
1662
1663 // Subvector insertions are cheap if the subvectors are aligned.
1664 // Note that in general, the insertion starting at the beginning of a vector
1665 // isn't free, because we need to preserve the rest of the wide vector,
1666 // but if the destination vector legalizes to the same width as the subvector
1667 // then the insertion will simplify to a (free) register copy.
1668 if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
1669 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(DstTy);
1670 int NumElts = DstLT.second.getVectorNumElements();
1671 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
1672 if (SubLT.second.isVector()) {
1673 int NumSubElts = SubLT.second.getVectorNumElements();
1674 bool MatchingTypes =
1675 NumElts == NumSubElts &&
1676 (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0;
1677 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
1678 return MatchingTypes ? TTI::TCC_Free : SubLT.first;
1679 }
1680
1681 // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
1682 // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
1683    // v1f32 (legalized to f32) into a v4f32.
1684 if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
1685 SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
1686 return 1;
1687
1688    // If the insertion is into the lowest subvector then it will be blended;
1689    // otherwise treat it like a 2-op shuffle.
1690 Kind =
1691 (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc;
1692 }
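  // For example: inserting a v4f32 into either 128-bit half of a v8f32 is
  // aligned and costs SubLT.first (typically one vinsertf128); an unaligned
  // insertion at element 0 falls through as a blend (SK_Select), and anything
  // else as a full two-input permute.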
1693
1694 // Handle some common (illegal) sub-vector types as they are often very cheap
1695 // to shuffle even on targets without PSHUFB.
1696 EVT VT = TLI->getValueType(DL, SrcTy);
1697 if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
1698 !ST->hasSSSE3()) {
1699 static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = {
1700 {TTI::SK_Broadcast, MVT::v4i16, {1,1,1,1}}, // pshuflw
1701 {TTI::SK_Broadcast, MVT::v2i16, {1,1,1,1}}, // pshuflw
1702 {TTI::SK_Broadcast, MVT::v8i8, {2,2,2,2}}, // punpck/pshuflw
1703 {TTI::SK_Broadcast, MVT::v4i8, {2,2,2,2}}, // punpck/pshuflw
1704 {TTI::SK_Broadcast, MVT::v2i8, {1,1,1,1}}, // punpck
1705
1706 {TTI::SK_Reverse, MVT::v4i16, {1,1,1,1}}, // pshuflw
1707 {TTI::SK_Reverse, MVT::v2i16, {1,1,1,1}}, // pshuflw
1708 {TTI::SK_Reverse, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw/packus
1709 {TTI::SK_Reverse, MVT::v2i8, {1,1,1,1}}, // punpck
1710
1711 {TTI::SK_Splice, MVT::v4i16, {2,2,2,2}}, // punpck+psrldq
1712 {TTI::SK_Splice, MVT::v2i16, {2,2,2,2}}, // punpck+psrldq
1713 {TTI::SK_Splice, MVT::v4i8, {2,2,2,2}}, // punpck+psrldq
1714 {TTI::SK_Splice, MVT::v2i8, {2,2,2,2}}, // punpck+psrldq
1715
1716 {TTI::SK_PermuteTwoSrc, MVT::v4i16, {2,2,2,2}}, // punpck/pshuflw
1717 {TTI::SK_PermuteTwoSrc, MVT::v2i16, {2,2,2,2}}, // punpck/pshuflw
1718 {TTI::SK_PermuteTwoSrc, MVT::v8i8, {7,7,7,7}}, // punpck/pshuflw
1719 {TTI::SK_PermuteTwoSrc, MVT::v4i8, {4,4,4,4}}, // punpck/pshuflw
1720 {TTI::SK_PermuteTwoSrc, MVT::v2i8, {2,2,2,2}}, // punpck
1721
1722 {TTI::SK_PermuteSingleSrc, MVT::v4i16, {1,1,1,1}}, // pshuflw
1723 {TTI::SK_PermuteSingleSrc, MVT::v2i16, {1,1,1,1}}, // pshuflw
1724 {TTI::SK_PermuteSingleSrc, MVT::v8i8, {5,5,5,5}}, // punpck/pshuflw
1725 {TTI::SK_PermuteSingleSrc, MVT::v4i8, {3,3,3,3}}, // punpck/pshuflw
1726 {TTI::SK_PermuteSingleSrc, MVT::v2i8, {1,1,1,1}}, // punpck
1727 };
1728
1729 if (ST->hasSSE2())
1730 if (const auto *Entry =
1731 CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
1732 if (auto KindCost = Entry->Cost[CostKind])
1733 return LT.first * *KindCost;
1734 }
1735
1736  // We are going to permute multiple sources and the result will be in
1737  // multiple destinations. We provide an accurate cost only for splits where
1738  // the element type remains the same.
1739 if (LT.first != 1) {
1740 MVT LegalVT = LT.second;
1741 if (LegalVT.isVector() &&
1742 LegalVT.getVectorElementType().getSizeInBits() ==
1743 SrcTy->getElementType()->getPrimitiveSizeInBits() &&
1744 LegalVT.getVectorNumElements() <
1745 cast<FixedVectorType>(SrcTy)->getNumElements()) {
1746 unsigned VecTySize = DL.getTypeStoreSize(SrcTy);
1747 unsigned LegalVTSize = LegalVT.getStoreSize();
1748 // Number of source vectors after legalization:
1749 unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
1750 // Number of destination vectors after legalization:
1751 InstructionCost NumOfDests = LT.first;
1752
1753 auto *SingleOpTy = FixedVectorType::get(SrcTy->getElementType(),
1754 LegalVT.getVectorNumElements());
1755
1756 if (!Mask.empty() && NumOfDests.isValid()) {
1757 // Try to perform better estimation of the permutation.
1758 // 1. Split the source/destination vectors into real registers.
1759      // 2. Do the mask analysis to identify which real registers are
1760      // permuted. If more than one source register is used to build a
1761      // destination register, the cost for this destination register is
1762      // (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
1763      // source register is used, build the mask and calculate the cost as
1764      // a cost of PermuteSingleSrc.
1765      // Also, for the single register permute we try to identify if the
1766      // destination register is just a copy of the source register or a
1767      // copy of the previous destination register (the cost is
1768      // TTI::TCC_Basic). If the source register is just reused, the cost
1769      // for this operation is TTI::TCC_Free.
1770 NumOfDests =
1771          getTypeLegalizationCost(
1772              FixedVectorType::get(SrcTy->getElementType(), Mask.size()))
1773 .first;
1774 unsigned E = NumOfDests.getValue();
1775 unsigned NormalizedVF =
1776 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
1777 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
1778 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
1779 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
1780 copy(Mask, NormalizedMask.begin());
1781 unsigned PrevSrcReg = 0;
1782 ArrayRef<int> PrevRegMask;
1783      InstructionCost Cost = 0;
1784      processShuffleMasks(
1785          NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
1786 [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
1787 &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
1788 if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
1789 // Check if the previous register can be just copied to the next
1790 // one.
1791 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
1792 PrevRegMask != RegMask)
1793 Cost +=
1794                    getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
1795                                   SingleOpTy, RegMask, CostKind, 0, nullptr);
1796 else
1797 // Just a copy of previous destination register.
1798                Cost += TTI::TCC_Basic;
1799              return;
1800 }
1801 if (SrcReg != DestReg &&
1802 any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
1803 // Just a copy of the source register.
1804              Cost += TTI::TCC_Basic;
1805            }
1806 PrevSrcReg = SrcReg;
1807 PrevRegMask = RegMask;
1808 },
1809 [this, SingleOpTy, CostKind,
1810 &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
1811 unsigned /*Unused*/, bool /*Unused*/) {
1812            Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1813                                   SingleOpTy, RegMask, CostKind, 0, nullptr);
1814 });
1815 return Cost;
1816 }
1817
1818 InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
1819 return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
1820 SingleOpTy, {}, CostKind, 0,
1821 nullptr);
1822 }
1823
1824 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1825 SubTp);
1826 }
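  // For example: a v16i32 single-source shuffle on an SSE2 target splits into
  // four v4i32 registers; each destination register built from one source
  // register costs one PermuteSingleSrc, and one built from N > 1 source
  // registers costs (N - 1) * PermuteTwoSrc, per the analysis above.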
1827
1828 // If we're just moving a single element around (probably as an alternative to
1829 // extracting it), we can assume this is cheap.
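  // For example, the v4f32 mask <poison,2,poison,poison> defines only one
  // element, so it is charged a single basic shuffle below.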
1830 if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask)
1831 return TTI::TCC_Basic;
1832
1833 static const CostKindTblEntry AVX512VBMIShuffleTbl[] = {
1834 { TTI::SK_Reverse, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1835 { TTI::SK_Reverse, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1836 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 1, 1, 1, 1 } }, // vpermb
1837 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpermb
1838 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 2, 2, 2, 2 } }, // vpermt2b
1839 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // vpermt2b
1840 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 2, 2, 2, 2 } } // vpermt2b
1841 };
1842
1843 if (ST->hasVBMI())
1844 if (const auto *Entry =
1845 CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
1846 if (auto KindCost = Entry->Cost[CostKind])
1847 return LT.first * *KindCost;
1848
1849 static const CostKindTblEntry AVX512BWShuffleTbl[] = {
1850 { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1851 { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1852 { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1853
1854 { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
1855 { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
1856 { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1857 { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1858 { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
1859
1860 { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
1861 { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
1862 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
1863 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
1864 { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
1865
1866 { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
1867 { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
1868 { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
1869 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
1870 { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
1871
1872 { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
1873 { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
1874
1875 { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1876 { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1877 { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
1878 };
1879
1880 if (ST->hasBWI())
1881 if (const auto *Entry =
1882 CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
1883 if (auto KindCost = Entry->Cost[CostKind])
1884 return LT.first * *KindCost;
1885
1886 static const CostKindTblEntry AVX512ShuffleTbl[] = {
1887 {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1888 {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
1889 {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
1890 {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
1891 {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1892 {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
1893 {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1894 {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
1895 {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1896 {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1897 {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1898 {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
1899 {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1900    {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 } }, // vpbroadcastb
1901
1902 {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
1903 {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
1904 {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
1905 {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
1906 {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
1907 {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
1908 {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
1909
1910 {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
1911 {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
1912 {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
1913 {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
1914 {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
1915 {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
1916 {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
1917 {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
1918 {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
1919 {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
1920 {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
1921
1922 {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
1923 {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
1924 {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
1925 {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
1926 {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
1927 {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
1928 {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
1929 {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
1930 {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
1931 {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
1932 {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
1933 {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
1934 {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
1935
1936 {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
1937 {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
1938 {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
1939 {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
1940 {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
1941 {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
1942 {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
1943 {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
1944 {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
1945 {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
1946 {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
1947 {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
1948
1949 // FIXME: This just applies the type legalization cost rules above
1950 // assuming these completely split.
1951 {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
1952 {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
1953 {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
1954 {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
1955 {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
1956 {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
1957
1958 {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
1959 {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
1960 {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
1961 {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
1962 {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
1963 {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
1964 {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
1965 };
1966
1967 if (ST->hasAVX512())
1968 if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
1969 if (auto KindCost = Entry->Cost[CostKind])
1970 return LT.first * *KindCost;
1971
1972 static const CostKindTblEntry AVX2InLaneShuffleTbl[] = {
1973 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 1, 1, 1, 1 } }, // vpshufb
1974 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 1, 1, 1, 1 } }, // vpshufb
1975 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 1, 1, 1, 1 } }, // vpshufb
1976
1977 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
1978 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
1979 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1980 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpshufd + vpblendd
1981 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1982 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1983 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 2, 2, 2, 2 } }, // 2*vpshufb + vpor
1984 };
1985
1986 if (IsInLaneShuffle && ST->hasAVX2())
1987 if (const auto *Entry =
1988 CostTableLookup(AVX2InLaneShuffleTbl, Kind, LT.second))
1989 if (auto KindCost = Entry->Cost[CostKind])
1990 return LT.first * *KindCost;
1991
1992 static const CostKindTblEntry AVX2ShuffleTbl[] = {
1993 { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
1994 { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
1995 { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
1996 { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
1997 { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
1998 { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
1999 { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
2000 { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
2001 { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
2002 { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
2003
2004 { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
2005 { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
2006 { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
2007 { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
2008 { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2009 { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2010 { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
2011
2012 { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
2013 { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
2014 { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
2015
2016 { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2017 { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2018 { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2019 { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2020 { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
2021
2022 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
2023 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
2024 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
2025 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
2026 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
2027 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
2028 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
2029
2030 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
2031 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
2032 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
2033 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
2034 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
2035 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
2036 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
2037 };
2038
2039 if (ST->hasAVX2())
2040 if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
2041 if (auto KindCost = Entry->Cost[CostKind])
2042 return LT.first * *KindCost;
2043
2044 static const CostKindTblEntry XOPShuffleTbl[] = {
2045 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2046 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2047 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2pd
2048 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2f128 + vpermil2ps
2049 { TTI::SK_PermuteSingleSrc, MVT::v16i16,{ 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2050 // + vinsertf128
2051 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*vpperm
2052 // + vinsertf128
2053
2054 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2055 // + vinsertf128
2056
2057 { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 1, 1, 1, 1 } }, // vpperm
2058 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 6*vpperm
2059 // + vinsertf128
2060 { TTI::SK_PermuteTwoSrc, MVT::v16i8, { 1, 1, 1, 1 } }, // vpperm
2061 };
2062
2063 if (ST->hasXOP())
2064 if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
2065 if (auto KindCost = Entry->Cost[CostKind])
2066 return LT.first * *KindCost;
2067
2068 static const CostKindTblEntry AVX1InLaneShuffleTbl[] = {
2069 { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermilpd
2070 { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermilpd
2071 { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermilps
2072 { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermilps
2073
2074 { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2075 // + vpor + vinsertf128
2076 { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2077 // + vpor + vinsertf128
2078 { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } }, // vextractf128 + 2*pshufb
2079 // + vpor + vinsertf128
2080
2081 { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 2, 2, 2, 2 } }, // 2*vshufpd + vblendpd
2082 { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 2, 2, 2, 2 } }, // 2*vshufps + vblendps
2083 { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 2, 2, 2, 2 } }, // 2*vpermilpd + vblendpd
2084 { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 2, 2, 2, 2 } }, // 2*vpermilps + vblendps
2085 { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2086 // + 2*vpor + vinsertf128
2087 { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2088 // + 2*vpor + vinsertf128
2089 { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 9, 9, 9, 9 } }, // 2*vextractf128 + 4*pshufb
2090 // + 2*vpor + vinsertf128
2091 };
2092
2093 if (IsInLaneShuffle && ST->hasAVX())
2094 if (const auto *Entry =
2095 CostTableLookup(AVX1InLaneShuffleTbl, Kind, LT.second))
2096 if (auto KindCost = Entry->Cost[CostKind])
2097 return LT.first * *KindCost;
2098
2099 static const CostKindTblEntry AVX1ShuffleTbl[] = {
2100 {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2101 {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
2102 {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
2103 {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
2104 {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2105 {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
2106 {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
2107
2108 {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2109 {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
2110 {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
2111 {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
2112 {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2113 // + vinsertf128
2114 {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
2115 // + vinsertf128
2116 {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
2117 // + vinsertf128
2118
2119 {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
2120 {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
2121 {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
2122 {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
2123 {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
2124 {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
2125 {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
2126
2127 {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
2128 {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
2129 {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2130 {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2131 {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2132 {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2133 {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
2134
2135 {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
2136 {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
2137 {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2138 {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2139 {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2140 // + 2*por + vinsertf128
2141 {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
2142 // + 2*por + vinsertf128
2143 {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
2144 // + 2*por + vinsertf128
2145
2146 {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2147 {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
2148 {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2149 {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
2150 {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2151 // + 4*por + vinsertf128
2152 {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2153 // + 4*por + vinsertf128
2154 {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
2155 // + 4*por + vinsertf128
2156 };
2157
2158 if (ST->hasAVX())
2159 if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
2160 if (auto KindCost = Entry->Cost[CostKind])
2161 return LT.first * *KindCost;
2162
2163 static const CostKindTblEntry SSE41ShuffleTbl[] = {
2164 {TTI::SK_Select, MVT::v2i64, {1,1,1,1}}, // pblendw
2165 {TTI::SK_Select, MVT::v2f64, {1,1,1,1}}, // movsd
2166 {TTI::SK_Select, MVT::v4i32, {1,1,1,1}}, // pblendw
2167 {TTI::SK_Select, MVT::v4f32, {1,1,1,1}}, // blendps
2168 {TTI::SK_Select, MVT::v8i16, {1,1,1,1}}, // pblendw
2169 {TTI::SK_Select, MVT::v8f16, {1,1,1,1}}, // pblendw
2170 {TTI::SK_Select, MVT::v16i8, {1,1,1,1}} // pblendvb
2171 };
2172
2173 if (ST->hasSSE41())
2174 if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
2175 if (auto KindCost = Entry->Cost[CostKind])
2176 return LT.first * *KindCost;
2177
2178 static const CostKindTblEntry SSSE3ShuffleTbl[] = {
2179 {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
2180 {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
2181 {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
2182
2183 {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
2184 {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
2185 {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
2186
2187 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2188 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2189 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2190
2191 {TTI::SK_Splice, MVT::v4i32, {1, 1, 1, 1}}, // palignr
2192 {TTI::SK_Splice, MVT::v4f32, {1, 1, 1, 1}}, // palignr
2193 {TTI::SK_Splice, MVT::v8i16, {1, 1, 1, 1}}, // palignr
2194 {TTI::SK_Splice, MVT::v8f16, {1, 1, 1, 1}}, // palignr
2195 {TTI::SK_Splice, MVT::v16i8, {1, 1, 1, 1}}, // palignr
2196
2197 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
2198 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
2199 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
2200
2201 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
2202 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
2203 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {3, 3, 3, 3}}, // 2*pshufb + por
2204 };
2205
2206 if (ST->hasSSSE3())
2207 if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
2208 if (auto KindCost = Entry->Cost[CostKind])
2209 return LT.first * *KindCost;
2210
2211 static const CostKindTblEntry SSE2ShuffleTbl[] = {
2212 {TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2213 {TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2214 {TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2215 {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
2216 {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
2217 {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
2218
2219 {TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2220 {TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2221 {TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2222 {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2223 {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
2224 {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
2225 // + 2*pshufd + 2*unpck + packus
2226
2227 {TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
2228 {TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
2229 {TTI::SK_Select, MVT::v4i32, {2, 2, 2, 2}}, // 2*shufps
2230 {TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // pand + pandn + por
2231 {TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // pand + pandn + por
2232 {TTI::SK_Select, MVT::v16i8, {3, 3, 3, 3}}, // pand + pandn + por
2233
2234 {TTI::SK_Splice, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2235 {TTI::SK_Splice, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2236 {TTI::SK_Splice, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2237 {TTI::SK_Splice, MVT::v8i16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2238 {TTI::SK_Splice, MVT::v8f16, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2239 {TTI::SK_Splice, MVT::v16i8, {3, 3, 3, 3}}, // psrldq + psrlldq + por
2240
2241 {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2242 {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
2243 {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
2244 {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2245 // + pshufd/unpck
2246 {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
2247 // + pshufd/unpck
2248 {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
2249 // + 2*pshufd + 2*unpck + 2*packus
2250
2251 {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
2252 {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
2253 {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
2254 {TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
2255 {TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
2256 {TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
2257 };
2258
2259 static const CostTblEntry SSE3BroadcastLoadTbl[] = {
2260 {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
2261 };
2262
2263 if (ST->hasSSE2()) {
2264 bool IsLoad =
2265 llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); });
2266 if (ST->hasSSE3() && IsLoad)
2267 if (const auto *Entry =
2268 CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) {
2269 assert(isLegalBroadcastLoad(SrcTy->getElementType(),
2270 LT.second.getVectorElementCount()) &&
2271 "Table entry missing from isLegalBroadcastLoad()");
2272 return LT.first * Entry->Cost;
2273 }
2274
2275 if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
2276 if (auto KindCost = Entry->Cost[CostKind])
2277 return LT.first * *KindCost;
2278 }
2279
2280 static const CostKindTblEntry SSE1ShuffleTbl[] = {
2281 { TTI::SK_Broadcast, MVT::v4f32, {1,1,1,1} }, // shufps
2282 { TTI::SK_Reverse, MVT::v4f32, {1,1,1,1} }, // shufps
2283 { TTI::SK_Select, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2284 { TTI::SK_Splice, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2285 { TTI::SK_PermuteSingleSrc, MVT::v4f32, {1,1,1,1} }, // shufps
2286 { TTI::SK_PermuteTwoSrc, MVT::v4f32, {2,2,2,2} }, // 2*shufps
2287 };
2288
2289 if (ST->hasSSE1()) {
2290 if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) {
2291 // SHUFPS: both pairs must come from the same source register.
2292 auto MatchSHUFPS = [](int X, int Y) {
2293 return X < 0 || Y < 0 || ((X & 4) == (Y & 4));
2294 };
2295 if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3]))
2296 return 1;
2297 }
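    // For example: mask <0,3,4,6> matches the check above, since elements
    // {0,3} both come from source 0 and {4,6} both from source 1 (bit 2 of
    // the mask index selects the source), so a single shufps suffices.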
2298 if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
2299 if (auto KindCost = Entry->Cost[CostKind])
2300 return LT.first * *KindCost;
2301 }
2302
2303 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
2304 SubTp);
2305}
2306
2307InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2308                                             Type *Src,
2309                                             TTI::CastContextHint CCH,
2310                                             TTI::TargetCostKind CostKind,
2311                                             const Instruction *I) const {
2312 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2313 assert(ISD && "Invalid opcode");
2314
2315  // The cost tables include both specific, custom (non-legal) src/dst type
2316  // conversions and generic, legalized types. We check for custom conversions
2317  // first, before falling back to legalization.
2318 // FIXME: Need a better design of the cost table to handle non-simple types of
2319 // potential massive combinations (elem_num x src_type x dst_type).
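  // For example: with AVX512BW a sign extension of v32i8 to v32i16 hits the
  // first entry below directly (a single vpmovsxbw), with no need to reason
  // about type legalization.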
2320 static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{
2321 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2322 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 1, 1, 1, 1 } },
2323
2324 // Mask sign extend has an instruction.
2325 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2326 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2327 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2328 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2329 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2330 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2331 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2332 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2333 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2334 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2335 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2336 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2337 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2338 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2339 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, { 1, 1, 1, 1 } },
2340 { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, { 1, 1, 1, 1 } },
2341 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, { 1, 1, 1, 1 } },
2342
2343 // Mask zero extend is a sext + shift.
2344 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2345 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2346 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2347 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2348 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2349 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2350 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2351 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2352 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2353 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2354 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2355 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2356 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2357 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2358 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, { 2, 1, 1, 1 } },
2359 { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, { 2, 1, 1, 1 } },
2360 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, { 2, 1, 1, 1 } },
2361
2362 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2363 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2364 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2365 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2366 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2367 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2368 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2369 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2370 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2371 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2372 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2373 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2374 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2375 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2376 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, { 2, 1, 1, 1 } },
2377 { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, { 2, 1, 1, 1 } },
2378 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, { 2, 1, 1, 1 } },
2379
2380 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 2, 1, 1, 1 } },
2381 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // widen to zmm
2382 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, { 2, 1, 1, 1 } }, // vpmovwb
2383 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, { 2, 1, 1, 1 } }, // vpmovwb
2384 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, { 2, 1, 1, 1 } }, // vpmovwb
2385 };
2386
2387 static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = {
2388 // Mask sign extend has an instruction.
2389 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2390 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2391 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2392 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2393 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2394 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, { 1, 1, 1, 1 } },
2395 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } },
2396 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } },
2397
2398 // Mask zero extend is a sext + shift.
2399 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2400 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2401 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2402 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2403 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2404 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, { 2, 1, 1, 1 } },
2405 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } },
2406 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } },
2407
2408 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2409 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2410 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2411 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2412 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2413 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } },
2414 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } },
2415 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, { 2, 1, 1, 1 } },
2416
2417 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2418 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2419
2420 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, { 1, 1, 1, 1 } },
2421 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 1, 1, 1, 1 } },
2422
2423 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2424 { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2425
2426 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, { 1, 1, 1, 1 } },
2427 { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, { 1, 1, 1, 1 } },
2428 };
2429
2430 // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
2431 // 256-bit wide vectors.
2432
2433 static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
2434 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 1, 1, 1, 1 } },
2435 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, { 3, 1, 1, 1 } },
2436 { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, { 4, 1, 1, 1 } }, // 2*vcvtps2pd+vextractf64x4
2437 { ISD::FP_EXTEND, MVT::v16f32, MVT::v16f16, { 1, 1, 1, 1 } }, // vcvtph2ps
2438 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
2439 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 1, 1, 1, 1 } },
2440 { ISD::FP_ROUND, MVT::v16f16, MVT::v16f32, { 1, 1, 1, 1 } }, // vcvtps2ph
2441
2442 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2443 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2444 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2445 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2446 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2447 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2448 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2449 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2450 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2451 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2452 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // zmm vpslld+vptestmd
2453 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2454 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2455 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // zmm vpsllq+vptestmq
2456 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2457 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, { 2, 1, 1, 1 } }, // vpmovdb
2458 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, { 2, 1, 1, 1 } }, // vpmovdb
2459 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2460 { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2461 { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdb
2462 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2463 { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, { 2, 1, 1, 1 } }, // vpmovdw
2464 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, { 2, 1, 1, 1 } }, // vpmovqb
2465 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, { 1, 1, 1, 1 } }, // vpshufb
2466 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2467 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2468 { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2469 { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqb
2470 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2471 { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2472 { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, { 2, 1, 1, 1 } }, // vpmovqw
2473 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, { 1, 1, 1, 1 } }, // vpmovqd
2474 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // zmm vpmovqd
2475 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, { 5, 1, 1, 1 } }, // 2*vpmovqd+concat+vpmovdb
2476
2477 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } }, // extend to v16i32
2478 { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, { 8, 1, 1, 1 } },
2479 { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, { 8, 1, 1, 1 } },
2480
2481 // Sign extend is zmm vpternlogd+vptruncdb.
2482 // Zero extend is zmm broadcast load+vptruncdb.
2483 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 3, 1, 1, 1 } },
2484 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 4, 1, 1, 1 } },
2485 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 3, 1, 1, 1 } },
2486 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 4, 1, 1, 1 } },
2487 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 3, 1, 1, 1 } },
2488 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 4, 1, 1, 1 } },
2489 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 3, 1, 1, 1 } },
2490 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 4, 1, 1, 1 } },
2491
2492 // Sign extend is zmm vpternlogd+vptruncdw.
2493 // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
2494 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 3, 1, 1, 1 } },
2495 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2496 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 3, 1, 1, 1 } },
2497 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2498 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 3, 1, 1, 1 } },
2499 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2500 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 3, 1, 1, 1 } },
2501 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2502
2503 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2504 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2505 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2506 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2507 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // zmm vpternlogd
2508 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // zmm vpternlogd+psrld
2509 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2510 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2511 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // zmm vpternlogq
2512 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // zmm vpternlogq+psrlq
2513
2514 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2515 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2516 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogq
2517 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2518
2519 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2520 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, { 1, 1, 1, 1 } },
2521 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2522 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 1, 1, 1, 1 } },
2523 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2524 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, { 1, 1, 1, 1 } },
2525 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2526 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, { 1, 1, 1, 1 } },
2527 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2528 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, { 1, 1, 1, 1 } },
2529
2530 { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2531 { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, { 3, 1, 1, 1 } }, // FIXME: May not be right
2532
2533 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2534 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2535 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2536 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2537 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2538 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2539 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2540 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2541
2542 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, { 4, 1, 1, 1 } },
2543 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, { 3, 1, 1, 1 } },
2544 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, { 2, 1, 1, 1 } },
2545 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, { 1, 1, 1, 1 } },
2546 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, { 2, 1, 1, 1 } },
2547 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, { 1, 1, 1, 1 } },
2548 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 1, 1, 1, 1 } },
2549 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, { 1, 1, 1, 1 } },
2550 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, {26, 1, 1, 1 } },
2551 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, { 5, 1, 1, 1 } },
2552
2553 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2554 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, { 7, 1, 1, 1 } },
2555 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, {15, 1, 1, 1 } },
2556 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, {11, 1, 1, 1 } },
2557 { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, {31, 1, 1, 1 } },
2558 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2559 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, { 7, 1, 1, 1 } },
2560 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, { 5, 1, 1, 1 } },
2561 { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, {15, 1, 1, 1 } },
2562 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2563 { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, { 3, 1, 1, 1 } },
2564
2565 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2566 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, { 3, 1, 1, 1 } },
2567 { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, { 3, 1, 1, 1 } },
2568 { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, { 1, 1, 1, 1 } },
2569 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, { 3, 1, 1, 1 } },
2570 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, { 3, 1, 1, 1 } },
2571 };
2572
2573 static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = {
2574 // Mask sign extend has an instruction.
2575 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 1, 1, 1, 1 } },
2576 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, { 1, 1, 1, 1 } },
2577 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 1, 1, 1, 1 } },
2578 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, { 1, 1, 1, 1 } },
2579 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 1, 1, 1, 1 } },
2580 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, { 1, 1, 1, 1 } },
2581 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 1, 1, 1, 1 } },
2582 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, { 1, 1, 1, 1 } },
2583 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 1, 1, 1, 1 } },
2584 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, { 1, 1, 1, 1 } },
2585 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 1, 1, 1, 1 } },
2586 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, { 1, 1, 1, 1 } },
2587 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2588 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, { 1, 1, 1, 1 } },
2589 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, { 1, 1, 1, 1 } },
2590 { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, { 1, 1, 1, 1 } },
2591 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, { 1, 1, 1, 1 } },
2592
2593 // Mask zero extend is a sext + shift.
2594 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 2, 1, 1, 1 } },
2595 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, { 2, 1, 1, 1 } },
2596 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 2, 1, 1, 1 } },
2597 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, { 2, 1, 1, 1 } },
2598 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 2, 1, 1, 1 } },
2599 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, { 2, 1, 1, 1 } },
2600 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 2, 1, 1, 1 } },
2601 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, { 2, 1, 1, 1 } },
2602 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 2, 1, 1, 1 } },
2603 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, { 2, 1, 1, 1 } },
2604 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 2, 1, 1, 1 } },
2605 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, { 2, 1, 1, 1 } },
2606 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 2, 1, 1, 1 } },
2607 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, { 2, 1, 1, 1 } },
2608 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, { 2, 1, 1, 1 } },
2609 { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, { 2, 1, 1, 1 } },
2610 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, { 2, 1, 1, 1 } },
2611
2612 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 2, 1, 1, 1 } },
2613 { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, { 2, 1, 1, 1 } },
2614 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } },
2615 { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, { 2, 1, 1, 1 } },
2616 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } },
2617 { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, { 2, 1, 1, 1 } },
2618 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 2, 1, 1, 1 } },
2619 { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, { 2, 1, 1, 1 } },
2620 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 2, 1, 1, 1 } },
2621 { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, { 2, 1, 1, 1 } },
2622 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 2, 1, 1, 1 } },
2623 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 2, 1, 1, 1 } },
2624 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 2, 1, 1, 1 } },
2625 { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, { 2, 1, 1, 1 } },
2626 { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, { 2, 1, 1, 1 } },
2627 { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, { 2, 1, 1, 1 } },
2628 { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, { 2, 1, 1, 1 } },
2629
2630 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } },
2631 };
2632
2633 static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
2634 // Mask sign extend has an instruction.
2635 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } },
2636 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, { 1, 1, 1, 1 } },
2637 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } },
2638 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, { 1, 1, 1, 1 } },
2639 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } },
2640 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, { 1, 1, 1, 1 } },
2641 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } },
2642 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } },
2643
2644 // Mask zero extend is a sext + shift.
2645 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } },
2646 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, { 2, 1, 1, 1 } },
2647 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } },
2648 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, { 2, 1, 1, 1 } },
2649 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } },
2650 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, { 2, 1, 1, 1 } },
2651 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } },
2652 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } },
2653
2654 { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, { 2, 1, 1, 1 } },
2655 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } },
2656 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } },
2657 { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, { 2, 1, 1, 1 } },
2658 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } },
2659 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } },
2660 { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, { 2, 1, 1, 1 } },
2661 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2662
2663 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2664 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2665 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2666 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2667
2668 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 1, 1, 1, 1 } },
2669 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 1, 1, 1, 1 } },
2670 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, { 1, 1, 1, 1 } },
2671 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 1, 1, 1, 1 } },
2672
2673 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2674 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2675 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2676 { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2677
2678 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, { 1, 1, 1, 1 } },
2679 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, { 1, 1, 1, 1 } },
2680 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, { 1, 1, 1, 1 } },
2681 { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, { 1, 1, 1, 1 } },
2682 };
2683
2684 static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
2685 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2686 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2687 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 3, 1, 1, 1 } }, // sext+vpslld+vptestmd
2688 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, { 8, 1, 1, 1 } }, // split+2*v8i8
2689 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2690 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2691 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, { 3, 1, 1, 1 } }, // sext+vpsllq+vptestmq
2692 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 8, 1, 1, 1 } }, // split+2*v8i16
2693 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2694 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2695 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2696 { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, { 2, 1, 1, 1 } }, // vpslld+vptestmd
2697 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2698 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 2, 1, 1, 1 } }, // vpsllq+vptestmq
2699 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } }, // vpmovqd
2700 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqb
2701 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, { 2, 1, 1, 1 } }, // vpmovqw
2702 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, { 2, 1, 1, 1 } }, // vpmovwb
2703
2704 // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
2705 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
2706 { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, { 5, 1, 1, 1 } },
2707 { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, { 6, 1, 1, 1 } },
2708 { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, { 5, 1, 1, 1 } },
2709 { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, { 6, 1, 1, 1 } },
2710 { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, { 5, 1, 1, 1 } },
2711 { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, { 6, 1, 1, 1 } },
2712 { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, {10, 1, 1, 1 } },
2713 { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, {12, 1, 1, 1 } },
2714
2715 // sign extend is vpcmpeq+maskedmove+vpmovdw
2716 // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
2717 { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, { 4, 1, 1, 1 } },
2718 { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, { 5, 1, 1, 1 } },
2719 { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, { 4, 1, 1, 1 } },
2720 { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, { 5, 1, 1, 1 } },
2721 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, { 4, 1, 1, 1 } },
2722 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, { 5, 1, 1, 1 } },
2723 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, {10, 1, 1, 1 } },
2724 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, {12, 1, 1, 1 } },
2725
2726 { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogd
2727 { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2728 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogd
2729 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2730 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 1, 1, 1, 1 } }, // vpternlogd
2731 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2732 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, { 1, 1, 1, 1 } }, // vpternlogd
2733 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, { 2, 1, 1, 1 } }, // vpternlogd+psrld
2734
2735 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, { 1, 1, 1, 1 } }, // vpternlogq
2736 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2737 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 1, 1, 1, 1 } }, // vpternlogq
2738 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 2, 1, 1, 1 } }, // vpternlogq+psrlq
2739
2740 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2741 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 1, 1, 1, 1 } },
2742 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2743 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 1, 1, 1, 1 } },
2744 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2745 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 1, 1, 1, 1 } },
2746 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2747 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 1, 1, 1, 1 } },
2748 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2749 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 1, 1, 1, 1 } },
2750 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2751 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 1, 1, 1, 1 } },
2752
2753 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2754 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2755 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2756 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2757
2758 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2759 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2760 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2761 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 1, 1, 1, 1 } },
2762 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2763 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 1, 1, 1, 1 } },
2764 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 1, 1, 1, 1 } },
2765 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2766 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2767 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2768 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, { 5, 1, 1, 1 } },
2769 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2770 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, { 5, 1, 1, 1 } },
2771
2772 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2773 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, { 2, 1, 1, 1 } },
2774 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, { 5, 1, 1, 1 } },
2775
2776 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
2777 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
2778 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
2779 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
2780 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2781 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2782 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, { 1, 1, 1, 1 } },
2783 };
2784
2785 static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = {
2786 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2787 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 3, 1, 1, 1 } },
2788 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2789 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 3, 1, 1, 1 } },
2790 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2791 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 1, 1, 1, 1 } },
2792
2793 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2794 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 2, 1, 1, 1 } },
2795 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2796 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 2, 1, 1, 1 } },
2797 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2798 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 2, 1, 1, 1 } },
2799 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2800 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 2, 1, 1, 1 } },
2801 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2802 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 2, 1, 1, 1 } },
2803 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2804 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, { 3, 1, 1, 1 } },
2805 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2806 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 2, 1, 1, 1 } },
2807
2808 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 2, 1, 1, 1 } },
2809
2810 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 4, 1, 1, 1 } },
2811 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 4, 1, 1, 1 } },
2812 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 1, 1, 1, 1 } },
2813 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 1, 1, 1, 1 } },
2814 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 1, 1, 1, 1 } },
2815 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 4, 1, 1, 1 } },
2816 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 4, 1, 1, 1 } },
2817 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 1, 1, 1, 1 } },
2818 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 1, 1, 1, 1 } },
2819 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 5, 1, 1, 1 } },
2820 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 1, 1, 1, 1 } },
2821 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 2, 1, 1, 1 } },
2822
2823 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, { 3, 1, 1, 1 } },
2824 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, { 3, 1, 1, 1 } },
2825
2826 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2827 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 1, 1, 1, 1 } },
2828 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 1, 1, 1, 1 } },
2829 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 3, 1, 1, 1 } },
2830
2831 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 3, 1, 1, 1 } },
2832 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 3, 1, 1, 1 } },
2833 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 1, 1, 1, 1 } },
2834 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2835 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2836 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 4, 1, 1, 1 } },
2837 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 3, 1, 1, 1 } },
2838 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 4, 1, 1, 1 } },
2839
2840 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2841 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2842 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2843 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2844 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 1, 1, 1, 1 } },
2845 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 1, 1, 1, 1 } },
2846 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 3, 1, 1, 1 } },
2847
2848 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 2, 1, 1, 1 } },
2849 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 2, 1, 1, 1 } },
2850 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 2, 1, 1, 1 } },
2851 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 2, 1, 1, 1 } },
2852 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 2, 1, 1, 1 } },
2853 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 1, 1, 1, 1 } },
2854 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 2, 1, 1, 1 } },
2855 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2856 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2857 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2858 };
2859
2860 static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
2861 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2862 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, { 4, 1, 1, 1 } },
2863 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2864 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, { 4, 1, 1, 1 } },
2865 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2866 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, { 4, 1, 1, 1 } },
2867
2868 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2869 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, { 3, 1, 1, 1 } },
2870 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2871 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, { 3, 1, 1, 1 } },
2872 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2873 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, { 3, 1, 1, 1 } },
2874 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2875 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, { 3, 1, 1, 1 } },
2876 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2877 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, { 3, 1, 1, 1 } },
2878 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2879 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, { 3, 1, 1, 1 } },
2880
2881 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, { 4, 1, 1, 1 } },
2882 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, { 5, 1, 1, 1 } },
2883 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, { 4, 1, 1, 1 } },
2884 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, { 9, 1, 1, 1 } },
2885 { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, {11, 1, 1, 1 } },
2886
2887 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, { 6, 1, 1, 1 } },
2888 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 6, 1, 1, 1 } },
2889 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 2, 1, 1, 1 } }, // and+extract+packuswb
2890 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, { 5, 1, 1, 1 } },
2891 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
2892 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, { 5, 1, 1, 1 } },
2893 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, { 3, 1, 1, 1 } }, // and+extract+2*packusdw
2894 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, { 2, 1, 1, 1 } },
2895
2896 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, { 3, 1, 1, 1 } },
2897 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, { 3, 1, 1, 1 } },
2898 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, { 8, 1, 1, 1 } },
2899 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2900 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2901 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2902 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2903 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2904 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, { 2, 1, 1, 1 } },
2905 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, { 4, 1, 1, 1 } },
2906 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 5, 1, 1, 1 } },
2907 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, { 8, 1, 1, 1 } },
2908
2909 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, { 7, 1, 1, 1 } },
2910 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, { 7, 1, 1, 1 } },
2911 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, { 6, 1, 1, 1 } },
2912 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, { 4, 1, 1, 1 } },
2913 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, { 2, 1, 1, 1 } },
2914 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, { 4, 1, 1, 1 } },
2915 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, { 2, 1, 1, 1 } },
2916 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 4, 1, 1, 1 } },
2917 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, { 4, 1, 1, 1 } },
2918 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
2919 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, { 6, 1, 1, 1 } },
2920 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, { 8, 1, 1, 1 } },
2921 { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, {10, 1, 1, 1 } },
2922 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, {10, 1, 1, 1 } },
2923 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {18, 1, 1, 1 } },
2924 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 5, 1, 1, 1 } },
2925 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, {10, 1, 1, 1 } },
2926
2927 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2928 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2929 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2930 { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2931 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2932 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2933 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2934 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2935 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, { 2, 1, 1, 1 } },
2936 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, { 2, 1, 1, 1 } },
2937 { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, { 5, 1, 1, 1 } },
2938
2939 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, { 2, 1, 1, 1 } },
2940 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, { 2, 1, 1, 1 } },
2941 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, { 2, 1, 1, 1 } },
2942 { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, { 2, 1, 1, 1 } },
2943 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, { 2, 1, 1, 1 } },
2944 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, { 2, 1, 1, 1 } },
2945 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, { 2, 1, 1, 1 } },
2946 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, { 2, 1, 1, 1 } },
2947 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 3, 1, 1, 1 } },
2948 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
2949 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, { 6, 1, 1, 1 } },
2950 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, { 7, 1, 1, 1 } },
2951 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, { 7, 1, 1, 1 } },
2952
2953 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, { 1, 1, 1, 1 } },
2954 { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, { 1, 1, 1, 1 } },
2955 };
2956
2957 static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
2958 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2959 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 1, 1, 1, 1 } },
2960 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2961 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 1, 1, 1, 1 } },
2962 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2963 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
2964 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2965 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 1, 1, 1, 1 } },
2966 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2967 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
2968 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2969 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
2970
2971 // These truncates end up widening elements.
2972 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 1, 1, 1, 1 } }, // PMOVZXBQ
2973 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 1, 1, 1, 1 } }, // PMOVZXWQ
2974 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 1, 1, 1, 1 } }, // PMOVZXBD
2975
2976 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 2, 1, 1, 1 } },
2977 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 2, 1, 1, 1 } },
2978 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 2, 1, 1, 1 } },
2979
2980 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2981 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2982 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 1, 1, 1, 1 } },
2983 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 1, 1, 1, 1 } },
2984 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2985 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2986 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2987 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
2988 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 1, 1, 1, 1 } },
2989 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 1, 1, 1, 1 } },
2990 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, { 2, 1, 1, 1 } },
2991
2992 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 1, 1, 1, 1 } },
2993 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 1, 1, 1, 1 } },
2994 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 4, 1, 1, 1 } },
2995 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 4, 1, 1, 1 } },
2996 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 1, 1, 1, 1 } },
2997 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 1, 1, 1, 1 } },
2998 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 1, 1, 1, 1 } },
2999 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 1, 1, 1, 1 } },
3000 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 3, 1, 1, 1 } },
3001 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3002 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 2, 1, 1, 1 } },
3003 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {12, 1, 1, 1 } },
3004 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, {22, 1, 1, 1 } },
3005 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, { 4, 1, 1, 1 } },
3006
3007 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3008 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 1, 1, 1, 1 } },
3009 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3010 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 1, 1, 1, 1 } },
3011 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3012 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3013 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3014 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3015 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 1, 1, 1, 1 } },
3016 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 1, 1, 1, 1 } },
3017
3018 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 1, 1, 1, 1 } },
3019 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3020 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 1, 1, 1, 1 } },
3021 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3022 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 2, 1, 1, 1 } },
3023 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 2, 1, 1, 1 } },
3024 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 1, 1, 1, 1 } },
3025 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 1, 1, 1, 1 } },
3026 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3027 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3028 };
3029
3030 static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = {
3031 // These are somewhat magic numbers, justified by comparing the
3032 // llvm-mca output for our various supported scheduler models
3033 // and basing them on the worst-case scenario.
3034 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3035 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3036 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, { 3, 1, 1, 1 } },
3037 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, { 3, 1, 1, 1 } },
3038 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, { 3, 1, 1, 1 } },
3039 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3040 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, { 3, 1, 1, 1 } },
3041 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3042 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, { 3, 1, 1, 1 } },
3043 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, { 4, 1, 1, 1 } },
3044 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, { 8, 1, 1, 1 } },
3045 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, { 8, 1, 1, 1 } },
3046
3047 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, { 3, 1, 1, 1 } },
3048 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, { 3, 1, 1, 1 } },
3049 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, { 8, 1, 1, 1 } },
3050 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, { 9, 1, 1, 1 } },
3051 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, { 4, 1, 1, 1 } },
3052 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, { 4, 1, 1, 1 } },
3053 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, { 4, 1, 1, 1 } },
3054 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, { 4, 1, 1, 1 } },
3055 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, { 7, 1, 1, 1 } },
3056 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, { 7, 1, 1, 1 } },
3057 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, { 5, 1, 1, 1 } },
3058 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, {15, 1, 1, 1 } },
3059 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, {18, 1, 1, 1 } },
3060
3061 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3062 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3063 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3064 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, { 4, 1, 1, 1 } },
3065 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3066 { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3067 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3068 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3069 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, { 4, 1, 1, 1 } },
3070 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, { 4, 1, 1, 1 } },
3071
3072 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, { 4, 1, 1, 1 } },
3073 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, { 4, 1, 1, 1 } },
3074 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, { 4, 1, 1, 1 } },
3075 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, {15, 1, 1, 1 } },
3076 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, { 6, 1, 1, 1 } },
3077 { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, { 6, 1, 1, 1 } },
3078 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, { 5, 1, 1, 1 } },
3079 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, { 5, 1, 1, 1 } },
3080 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, { 8, 1, 1, 1 } },
3081 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, { 8, 1, 1, 1 } },
3082
3083 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3084 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, { 4, 1, 1, 1 } },
3085 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, { 2, 1, 1, 1 } },
3086 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, { 3, 1, 1, 1 } },
3087 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, { 1, 1, 1, 1 } },
3088 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, { 2, 1, 1, 1 } },
3089 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, { 2, 1, 1, 1 } },
3090 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, { 3, 1, 1, 1 } },
3091 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, { 1, 1, 1, 1 } },
3092 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, { 2, 1, 1, 1 } },
3093 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, { 1, 1, 1, 1 } },
3094 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, { 2, 1, 1, 1 } },
3095
3096 // These truncates are really widening elements.
3097 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, { 1, 1, 1, 1 } }, // PSHUFD
3098 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, { 2, 1, 1, 1 } }, // PUNPCKLWD+DQ
3099 { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, { 3, 1, 1, 1 } }, // PUNPCKLBW+WD+PSHUFD
3100 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, { 1, 1, 1, 1 } }, // PUNPCKLWD
3101 { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, { 2, 1, 1, 1 } }, // PUNPCKLBW+WD
3102 { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, { 1, 1, 1, 1 } }, // PUNPCKLBW
3103
3104 { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, { 2, 1, 1, 1 } }, // PAND+PACKUSWB
3105 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, { 3, 1, 1, 1 } },
3106 { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, { 3, 1, 1, 1 } }, // PAND+2*PACKUSWB
3107 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, { 7, 1, 1, 1 } },
3108 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, { 1, 1, 1, 1 } },
3109 { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, { 3, 1, 1, 1 } },
3110 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, { 5, 1, 1, 1 } },
3111 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, {10, 1, 1, 1 } },
3112 { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, { 4, 1, 1, 1 } }, // PAND+3*PACKUSWB
3113 { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, { 2, 1, 1, 1 } }, // PSHUFD+PSHUFLW
3114 { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, { 1, 1, 1, 1 } }, // PSHUFD
3115 };
3116
3117 static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
3118 { ISD::FP_ROUND, MVT::f16, MVT::f32, { 1, 1, 1, 1 } },
3119 { ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, { 1, 1, 1, 1 } },
3120 { ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, { 1, 1, 1, 1 } },
3121 { ISD::FP_EXTEND, MVT::f32, MVT::f16, { 1, 1, 1, 1 } },
3122 { ISD::FP_EXTEND, MVT::f64, MVT::f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3123 { ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, { 1, 1, 1, 1 } },
3124 { ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, { 1, 1, 1, 1 } },
3125 { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, { 2, 1, 1, 1 } }, // vcvtph2ps+vcvtps2pd
3126 };
3127
3128 // Attempt to map directly to (simple) MVT types to let us match custom entries.
3129 EVT SrcTy = TLI->getValueType(DL, Src);
3130 EVT DstTy = TLI->getValueType(DL, Dst);
3131
3132 // The function getSimpleVT only handles simple value types.
3133 if (SrcTy.isSimple() && DstTy.isSimple()) {
3134 MVT SimpleSrcTy = SrcTy.getSimpleVT();
3135 MVT SimpleDstTy = DstTy.getSimpleVT();
3136
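// Probe the tables from most to least specific feature set: the 512-bit
// register tables (BWI, DQI, AVX512F) first, then the 128/256-bit
// AVX512VL-style tables, then AVX2, AVX, F16C, SSE4.1 and finally SSE2.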
3137 if (ST->useAVX512Regs()) {
3138 if (ST->hasBWI())
3139 if (const auto *Entry = ConvertCostTableLookup(
3140 AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3141 if (auto KindCost = Entry->Cost[CostKind])
3142 return *KindCost;
3143
3144 if (ST->hasDQI())
3145 if (const auto *Entry = ConvertCostTableLookup(
3146 AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3147 if (auto KindCost = Entry->Cost[CostKind])
3148 return *KindCost;
3149
3150 if (ST->hasAVX512())
3151 if (const auto *Entry = ConvertCostTableLookup(
3152 AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3153 if (auto KindCost = Entry->Cost[CostKind])
3154 return *KindCost;
3155 }
3156
3157 if (ST->hasBWI())
3158 if (const auto *Entry = ConvertCostTableLookup(
3159 AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3160 if (auto KindCost = Entry->Cost[CostKind])
3161 return *KindCost;
3162
3163 if (ST->hasDQI())
3164 if (const auto *Entry = ConvertCostTableLookup(
3165 AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
3166 if (auto KindCost = Entry->Cost[CostKind])
3167 return *KindCost;
3168
3169 if (ST->hasAVX512())
3170 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3171 SimpleDstTy, SimpleSrcTy))
3172 if (auto KindCost = Entry->Cost[CostKind])
3173 return *KindCost;
3174
3175 if (ST->hasAVX2()) {
3176 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3177 SimpleDstTy, SimpleSrcTy))
3178 if (auto KindCost = Entry->Cost[CostKind])
3179 return *KindCost;
3180 }
3181
3182 if (ST->hasAVX()) {
3183 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3184 SimpleDstTy, SimpleSrcTy))
3185 if (auto KindCost = Entry->Cost[CostKind])
3186 return *KindCost;
3187 }
3188
3189 if (ST->hasF16C()) {
3190 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3191 SimpleDstTy, SimpleSrcTy))
3192 if (auto KindCost = Entry->Cost[CostKind])
3193 return *KindCost;
3194 }
3195
3196 if (ST->hasSSE41()) {
3197 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3198 SimpleDstTy, SimpleSrcTy))
3199 if (auto KindCost = Entry->Cost[CostKind])
3200 return *KindCost;
3201 }
3202
3203 if (ST->hasSSE2()) {
3204 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3205 SimpleDstTy, SimpleSrcTy))
3206 if (auto KindCost = Entry->Cost[CostKind])
3207 return *KindCost;
3208 }
3209
3210 if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) ||
3211 (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) {
3212 // fp16 conversions not covered by any table entries require a libcall.
3213 // Return a large (arbitrary) number to model this.
3214 return InstructionCost(64);
3215 }
3216 }
3217
3218 // Fall back to legalized types.
3219 std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src);
3220 std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst);
3221
3222 // If we're truncating to the same legalized type, just assume it's free.
3223 if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second)
3224 return TTI::TCC_Free;
3225
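// From here on, any table hit is scaled by the larger of the source and
// destination legalization counts, as each split fragment repeats the
// conversion.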
3226 if (ST->useAVX512Regs()) {
3227 if (ST->hasBWI())
3228 if (const auto *Entry = ConvertCostTableLookup(
3229 AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
3230 if (auto KindCost = Entry->Cost[CostKind])
3231 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3232
3233 if (ST->hasDQI())
3234 if (const auto *Entry = ConvertCostTableLookup(
3235 AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
3236 if (auto KindCost = Entry->Cost[CostKind])
3237 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3238
3239 if (ST->hasAVX512())
3240 if (const auto *Entry = ConvertCostTableLookup(
3241 AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
3242 if (auto KindCost = Entry->Cost[CostKind])
3243 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3244 }
3245
3246 if (ST->hasBWI())
3247 if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
3248 LTDest.second, LTSrc.second))
3249 if (auto KindCost = Entry->Cost[CostKind])
3250 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3251
3252 if (ST->hasDQI())
3253 if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
3254 LTDest.second, LTSrc.second))
3255 if (auto KindCost = Entry->Cost[CostKind])
3256 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3257
3258 if (ST->hasAVX512())
3259 if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
3260 LTDest.second, LTSrc.second))
3261 if (auto KindCost = Entry->Cost[CostKind])
3262 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3263
3264 if (ST->hasAVX2())
3265 if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
3266 LTDest.second, LTSrc.second))
3267 if (auto KindCost = Entry->Cost[CostKind])
3268 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3269
3270 if (ST->hasAVX())
3271 if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
3272 LTDest.second, LTSrc.second))
3273 if (auto KindCost = Entry->Cost[CostKind])
3274 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3275
3276 if (ST->hasF16C()) {
3277 if (const auto *Entry = ConvertCostTableLookup(F16ConversionTbl, ISD,
3278 LTDest.second, LTSrc.second))
3279 if (auto KindCost = Entry->Cost[CostKind])
3280 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3281 }
3282
3283 if (ST->hasSSE41())
3284 if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
3285 LTDest.second, LTSrc.second))
3286 if (auto KindCost = Entry->Cost[CostKind])
3287 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3288
3289 if (ST->hasSSE2())
3290 if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
3291 LTDest.second, LTSrc.second))
3292 if (auto KindCost = Entry->Cost[CostKind])
3293 return std::max(LTSrc.first, LTDest.first) * *KindCost;
3294
3295 // Fallback: for i8/i16 sitofp/uitofp cases we need to extend the source
3296 // to i32 and perform the conversion as sitofp from there.
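// (cvtsi2ss/cvtsi2sd only accept 32/64-bit integers; a value zero-extended
// from i8/i16 is always non-negative, so the signed convert covers uitofp
// as well.)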
3297 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3298 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
3299 Type *ExtSrc = Src->getWithNewBitWidth(32);
3300 unsigned ExtOpc =
3301 (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
3302
3303 // For scalar loads the extend would be free.
3304 InstructionCost ExtCost = 0;
3305 if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
3306 ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
3307
3308 return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
3309 TTI::CastContextHint::None, CostKind);
3310 }
3311
3312 // Fallback: for i8/i16 fptosi/fptoui cases we perform an i32 fptosi and
3313 // truncate the result.
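// (cvttss2si/cvttsd2si only produce i32/i64 results, so narrower
// destinations are computed at i32 and truncated afterwards.)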
3314 if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
3315 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
3316 Type *TruncDst = Dst->getWithNewBitWidth(32);
3317 return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
3318 getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
3319 TTI::CastContextHint::None, CostKind);
3320 }
3321
3322 // TODO: Allow non-throughput costs that aren't binary.
3323 auto AdjustCost = [&CostKind](InstructionCost Cost,
3324 InstructionCost N = 1) -> InstructionCost {
3325 if (CostKind != TTI::TCK_RecipThroughput)
3326 return Cost == 0 ? 0 : N;
3327 return Cost * N;
3328 };
3329 return AdjustCost(
3330 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3331}
3332
3333 InstructionCost X86TTIImpl::getCmpSelInstrCost(
3334 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3335 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3336 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
3337 // Early out if this type isn't scalar/vector integer/float.
3338 if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy()))
3339 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3340 Op1Info, Op2Info, I);
3341
3342 // Legalize the type.
3343 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3344
3345 MVT MTy = LT.second;
3346
3347 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3348 assert(ISD && "Invalid opcode");
3349
3350 InstructionCost ExtraCost = 0;
3351 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
3352 // Some vector comparison predicates cost extra instructions.
3353 // TODO: Adjust ExtraCost based on CostKind?
3354 // TODO: Should we invert this and assume worst case cmp costs
3355 // and reduce for particular predicates?
3356 if (MTy.isVector() &&
3357 !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
3358 (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
3359 ST->hasBWI())) {
3360 // Fallback to I if a specific predicate wasn't specified.
3361 CmpInst::Predicate Pred = VecPred;
3362 if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
3363 Pred == CmpInst::BAD_FCMP_PREDICATE))
3364 Pred = cast<CmpInst>(I)->getPredicate();
3365
3366 bool CmpWithConstant = false;
3367 if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(I))
3368 CmpWithConstant = isa<Constant>(CmpInstr->getOperand(1));
3369
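// Note: when comparing against a constant, the inverting xor / signbit
// adjustments in the expansions below can typically be folded into the
// constant operand, which is why the CmpWithConstant costs are lower.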
3370 switch (Pred) {
3371 case CmpInst::Predicate::ICMP_NE:
3372 // xor(cmpeq(x,y),-1)
3373 ExtraCost = CmpWithConstant ? 0 : 1;
3374 break;
3375 case CmpInst::Predicate::ICMP_SGE:
3376 case CmpInst::Predicate::ICMP_SLE:
3377 // xor(cmpgt(x,y),-1)
3378 ExtraCost = CmpWithConstant ? 0 : 1;
3379 break;
3380 case CmpInst::Predicate::ICMP_ULT:
3381 case CmpInst::Predicate::ICMP_UGT:
3382 // cmpgt(xor(x,signbit),xor(y,signbit))
3383 // xor(cmpeq(pmaxu(x,y),x),-1)
3384 ExtraCost = CmpWithConstant ? 1 : 2;
3385 break;
3386 case CmpInst::Predicate::ICMP_ULE:
3387 case CmpInst::Predicate::ICMP_UGE:
3388 if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
3389 (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
3390 // cmpeq(psubus(x,y),0)
3391 // cmpeq(pminu(x,y),x)
3392 ExtraCost = 1;
3393 } else {
3394 // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
3395 ExtraCost = CmpWithConstant ? 2 : 3;
3396 }
3397 break;
3398 case CmpInst::Predicate::FCMP_ONE:
3399 case CmpInst::Predicate::FCMP_UEQ:
3400 // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases.
3401 // Use FCMP_UEQ expansion - FCMP_ONE should be the same.
3402 if (CondTy && !ST->hasAVX())
3403 return getCmpSelInstrCost(Opcode, ValTy, CondTy,
3404 CmpInst::Predicate::FCMP_UNO, CostKind,
3405 Op1Info, Op2Info) +
3406 getCmpSelInstrCost(Opcode, ValTy, CondTy,
3407 CmpInst::Predicate::FCMP_OEQ, CostKind,
3408 Op1Info, Op2Info) +
3409 getArithmeticInstrCost(Instruction::Or, CondTy, CostKind);
3410
3411 break;
3412 case CmpInst::Predicate::BAD_ICMP_PREDICATE:
3413 case CmpInst::Predicate::BAD_FCMP_PREDICATE:
3414 // Assume worst case scenario and add the maximum extra cost.
3415 ExtraCost = 3;
3416 break;
3417 default:
3418 break;
3419 }
3420 }
3421 }
3422
3423 static const CostKindTblEntry SLMCostTbl[] = {
3424 // slm pcmpeq/pcmpgt throughput is 2
3425 { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } },
3426 // slm pblendvb/blendvpd/blendvps throughput is 4
3427 { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd
3428 { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps
3429 { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb
3430 { ISD::SELECT, MVT::v4i32, { 4, 4, 1, 3 } }, // pblendvb
3431 { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb
3432 { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb
3433 };
3434
3435 static const CostKindTblEntry AVX512BWCostTbl[] = {
3436 { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } },
3437 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } },
3438 { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } },
3439 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } },
3440
3441 { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } },
3442 { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } },
3443 };
3444
3445 static const CostKindTblEntry AVX512CostTbl[] = {
3446 { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } },
3447 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } },
3448 { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } },
3449 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } },
3450
3451 { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } },
3452 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } },
3453 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3454 { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } },
3455 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } },
3456 { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } },
3457 { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } },
3458
3459 { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } },
3460 { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } },
3461 { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } },
3462 { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } },
3463 { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } },
3464 { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } },
3465 { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } },
3466 { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } },
3467 { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } },
3468 { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } },
3469 { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } },
3470 { ISD::SELECT, MVT::v8f32, { 1, 1, 1, 1 } },
3471 { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } },
3472 { ISD::SELECT, MVT::f32, { 1, 1, 1, 1 } },
3473
3474 { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } },
3475 { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } },
3476 { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } },
3477 { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } },
3478 { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } },
3479 { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } },
3480 };
3481
3482 static const CostKindTblEntry AVX2CostTbl[] = {
3483 { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } },
3484 { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } },
3485 { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } },
3486 { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } },
3487 { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } },
3488 { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } },
3489
3490 { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } },
3491 { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } },
3492 { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } },
3493 { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } },
3494
3495 { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd
3496 { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps
3497 { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb
3498 { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb
3499 { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb
3500 { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb
3501 };
3502
3503 static const CostKindTblEntry XOPCostTbl[] = {
3504 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3505 { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } },
3506 };
3507
3508 static const CostKindTblEntry AVX1CostTbl[] = {
3509 { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } },
3510 { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } },
3511 { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } },
3512 { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } },
3513 { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } },
3514 { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } },
3515
3516 // AVX1 does not support 8-wide integer compare.
3517 { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } },
3518 { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } },
3519 { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } },
3520 { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } },
3521
3522 { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd
3523 { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps
3524 { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd
3525 { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps
3526 { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3527 { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps
3528 };
3529
3530 static const CostKindTblEntry SSE42CostTbl[] = {
3531 { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } },
3532 };
3533
3534 static const CostKindTblEntry SSE41CostTbl[] = {
3535 { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } },
3536 { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } },
3537
3538 { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd
3539 { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd
3540 { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps
3541 { ISD::SELECT, MVT::f32, { 2, 2, 1, 2 } }, // blendvps
3542 { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb
3543 { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb
3544 { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb
3545 { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb
3546 };
3547
3548 static const CostKindTblEntry SSE2CostTbl[] = {
3549 { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } },
3550 { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } },
3551
3552 { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion
3553 { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } },
3554 { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } },
3555 { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } },
3556
3557 { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3558 { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd
3559 { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por
3560 { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por
3561 { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por
3562 { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por
3563 };
3564
3565 static const CostKindTblEntry SSE1CostTbl[] = {
3566 { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } },
3567 { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } },
3568
3569 { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3570 { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps
3571 };
3572
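// Consult the tables from the most feature-specific subtarget down to the
// baseline; the first matching entry wins. For example, an ISD::SETCC on
// MVT::v4i64 with only AVX1 available hits the AVX1CostTbl row { 4, 2, 5, 6 },
// so a TCK_RecipThroughput query returns LT.first * (ExtraCost + 4).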
3573 if (ST->useSLMArithCosts())
3574 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
3575 if (auto KindCost = Entry->Cost[CostKind])
3576 return LT.first * (ExtraCost + *KindCost);
3577
3578 if (ST->hasBWI())
3579 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
3580 if (auto KindCost = Entry->Cost[CostKind])
3581 return LT.first * (ExtraCost + *KindCost);
3582
3583 if (ST->hasAVX512())
3584 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
3585 if (auto KindCost = Entry->Cost[CostKind])
3586 return LT.first * (ExtraCost + *KindCost);
3587
3588 if (ST->hasAVX2())
3589 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
3590 if (auto KindCost = Entry->Cost[CostKind])
3591 return LT.first * (ExtraCost + *KindCost);
3592
3593 if (ST->hasXOP())
3594 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
3595 if (auto KindCost = Entry->Cost[CostKind])
3596 return LT.first * (ExtraCost + *KindCost);
3597
3598 if (ST->hasAVX())
3599 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
3600 if (auto KindCost = Entry->Cost[CostKind])
3601 return LT.first * (ExtraCost + *KindCost);
3602
3603 if (ST->hasSSE42())
3604 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
3605 if (auto KindCost = Entry->Cost[CostKind])
3606 return LT.first * (ExtraCost + *KindCost);
3607
3608 if (ST->hasSSE41())
3609 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
3610 if (auto KindCost = Entry->Cost[CostKind])
3611 return LT.first * (ExtraCost + *KindCost);
3612
3613 if (ST->hasSSE2())
3614 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
3615 if (auto KindCost = Entry->Cost[CostKind])
3616 return LT.first * (ExtraCost + *KindCost);
3617
3618 if (ST->hasSSE1())
3619 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
3620 if (auto KindCost = Entry->Cost[CostKind])
3621 return LT.first * (ExtraCost + *KindCost);
3622
3623 // Assume a 3cy latency for fp select ops.
3624 if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
3625 if (ValTy->getScalarType()->isFloatingPointTy())
3626 return 3;
3627
3628 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3629 Op1Info, Op2Info, I);
3630}
3631
3633 InstructionCost
3634 X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
3635 TTI::TargetCostKind CostKind) const {
3636
3637 // Costs should match the codegen from:
3638 // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
3639 // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
3640 // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
3641 // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
3642 // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
3643
3644 // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
3645 // specialized in these tables yet.
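// Entries keyed on X86ISD::VROTLI model rotates by a uniform constant
// amount; the fshl/fshr handling further below switches to that opcode when
// the rotation amount is a matching constant.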
3646 static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
3647 { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } },
3648 { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } },
3649 { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } },
3650 { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } },
3651 { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } },
3652 { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } },
3653 { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } },
3654 { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } },
3655 { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } },
3656 { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } },
3657 { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } },
3658 { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } },
3659 { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } },
3660 { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } },
3661 { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } },
3662 { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } },
3663 { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } },
3664 { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } },
3665 };
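// VBMI2 provides native funnel-shift instructions (VPSHLD/VPSHRD and their
// variable forms), so FSHL and the 16-bit rotates above are single ops.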
3666 static const CostKindTblEntry AVX512BITALGCostTbl[] = {
3667 { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } },
3668 { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } },
3669 { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } },
3670 { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } },
3671 { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } },
3672 { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } },
3673 };
3674 static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
3675 { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } },
3676 { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } },
3677 { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } },
3678 { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } },
3679 { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } },
3680 { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } },
3681 };
3682 static const CostKindTblEntry AVX512CDCostTbl[] = {
3683 { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } },
3684 { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } },
3685 { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } },
3686 { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } },
3687 { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } },
3688 { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } },
3689 { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } },
3690 { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } },
3691 { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } },
3692 { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } },
3693 { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } },
3694 { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } },
3695
3696 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3697 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3698 { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } },
3699 { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } },
3700 { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } },
3701 { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } },
3702 };
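// AVX512CD's VPLZCNTD/VPLZCNTQ give single-op CTLZ for i32/i64 elements
// only; the i16/i8 rows above are costed as expansions via wider element
// types, hence the much higher numbers.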
3703 static const CostKindTblEntry AVX512BWCostTbl[] = {
3704 { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } },
3705 { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } },
3706 { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } },
3707 { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } },
3708 { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } },
3709 { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } },
3710 { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } },
3711 { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } },
3712 { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } },
3713 { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } },
3714 { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } },
3715 { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } },
3716 { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } },
3717 { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } },
3718 { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } },
3719 { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } },
3720 { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } },
3721 { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } },
3722 { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } },
3723 { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } },
3724 { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } },
3725 { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } },
3726 { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } },
3727 { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } },
3728 { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } },
3729 { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } },
3730 { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } },
3731 { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } },
3732 { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } },
3733 { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } },
3734 { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } },
3735 { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } },
3736 { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } },
3737 { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } },
3738 { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } },
3739 { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } },
3740 { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } },
3741 { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } },
3742 { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } },
3743 { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } },
3744 { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } },
3745 { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } },
3746 { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } },
3747 { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } },
3748 { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } },
3749 { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } },
3750 { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } },
3751 { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } },
3752 { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } },
3753 { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } },
3754 { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } },
3755 { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } },
3756 { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } },
3757 { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } },
3758 { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } },
3759 { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } },
3760 { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } },
3761 { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } },
3762 { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } },
3763 { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } },
3764 { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } },
3765 { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } },
3766 { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } },
3767 { ISD::SADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3768 { ISD::SADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3769 { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3770 { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3771 { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3772 { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3773 { ISD::SMULO, MVT::v32i16, { 3, 6, 4, 4 } },
3774 { ISD::SMULO, MVT::v64i8, { 8, 21, 17, 18 } },
3775 { ISD::UMULO, MVT::v32i16, { 2, 5, 3, 3 } },
3776 { ISD::UMULO, MVT::v64i8, { 8, 15, 15, 16 } },
3777 { ISD::SSUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3778 { ISD::SSUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3779 { ISD::UADDSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3780 { ISD::UADDSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3781 { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } },
3782 { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } },
3783 { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } },
3784 { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } },
3785 { ISD::USUBSAT, MVT::v32i16, { 1, 1, 1, 1 } },
3786 { ISD::USUBSAT, MVT::v64i8, { 1, 1, 1, 1 } },
3787 };
3788 static const CostKindTblEntry AVX512CostTbl[] = {
3789 { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } },
3790 { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } },
3791 { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } },
3792 { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } },
3793 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } },
3794 { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } },
3795 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } },
3796 { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } },
3797 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } },
3798 { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } },
3799 { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } },
3800 { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } },
3801 { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } },
3802 { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } },
3803 { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } },
3804 { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } },
3805 { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } },
3806 { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } },
3807 { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } },
3808 { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } },
3809 { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } },
3810 { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } },
3811 { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } },
3812 { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } },
3813 { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } },
3814 { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } },
3815 { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } },
3816 { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } },
3817 { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } },
3818 { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } },
3819 { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } },
3820 { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } },
3821 { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } },
3822 { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } },
3823 { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } },
3824 { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } },
3825 { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } },
3826 { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } },
3827 { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } },
3828 { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } },
3829 { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } },
3830 { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } },
3831 { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } },
3832 { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } },
3833 { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } },
3834 { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } },
3835 { ISD::SADDSAT, MVT::v2i64, { 3, 3, 8, 9 } },
3836 { ISD::SADDSAT, MVT::v4i64, { 2, 2, 6, 7 } },
3837 { ISD::SADDSAT, MVT::v8i64, { 3, 3, 6, 7 } },
3838 { ISD::SADDSAT, MVT::v4i32, { 2, 2, 6, 7 } },
3839 { ISD::SADDSAT, MVT::v8i32, { 2, 2, 6, 7 } },
3840 { ISD::SADDSAT, MVT::v16i32, { 3, 3, 6, 7 } },
3841 { ISD::SADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3842 { ISD::SADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3843 { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3844 { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3845 { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3846 { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3847 { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3848 { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3849 { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3850 { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3851 { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3852 { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3853 { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3854 { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3855 { ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
3856 { ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
3857 { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
3858 { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
3859 { ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
3860 { ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
3861 { ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
3862 { ISD::SSUBSAT, MVT::v4i32, { 2, 14, 7, 8 } },
3863 { ISD::SSUBSAT, MVT::v8i32, { 2, 15, 7, 8 } },
3864 { ISD::SSUBSAT, MVT::v16i32, { 2, 14, 7, 8 } },
3865 { ISD::SSUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3866 { ISD::SSUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3867 { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } },
3868 { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } },
3869 { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } },
3870 { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } },
3871 { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } },
3872 { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } },
3873 { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } },
3874 { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } },
3875 { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } },
3876 { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } },
3877 { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } },
3878 { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
3879 { ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
3880 { ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
3881 { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
3882 { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
3883 { ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
3884 { ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
3885 { ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
3886 { ISD::UADDSAT, MVT::v4i32, { 1, 2, 4, 4 } },
3887 { ISD::UADDSAT, MVT::v8i32, { 1, 2, 4, 4 } },
3888 { ISD::UADDSAT, MVT::v16i32, { 2, 2, 4, 4 } },
3889 { ISD::UADDSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3890 { ISD::UADDSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3891 { ISD::USUBSAT, MVT::v2i64, { 1, 4, 2, 2 } },
3892 { ISD::USUBSAT, MVT::v4i64, { 1, 4, 2, 2 } },
3893 { ISD::USUBSAT, MVT::v8i64, { 1, 4, 2, 2 } },
3894 { ISD::USUBSAT, MVT::v8i32, { 1, 2, 2, 2 } },
3895 { ISD::USUBSAT, MVT::v16i32, { 1, 2, 2, 2 } },
3896 { ISD::USUBSAT, MVT::v32i16, { 2, 2, 2, 2 } },
3897 { ISD::USUBSAT, MVT::v64i8, { 2, 2, 2, 2 } },
3898 { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } },
3899 { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } },
3900 { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } },
3901 { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } },
3902 { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } },
3903 { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } },
3904 { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } },
3905 { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } },
3906 { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3907 { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3908 { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/
3909 { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/
3910 { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3911 { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3912 { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/
3913 { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/
3914 };
3915 static const CostKindTblEntry XOPCostTbl[] = {
3916 { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } },
3917 { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } },
3918 { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } },
3919 { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } },
3920 { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } },
3921 { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } },
3922 { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } },
3923 { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } },
3924 { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } },
3925 { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } },
3926 { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } },
3927 { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } },
3928 // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
3929 { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } },
3930 { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } },
3931 { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } },
3932 { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } },
3933 { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } },
3934 { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } },
3935 { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } },
3936 { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } },
3937 { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } },
3938 { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } },
3939 { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } },
3940 { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } },
3941 { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } },
3942 { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } },
3943 { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } },
3944 { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } },
3945 { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } },
3946 { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } },
3947 { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } },
3948 { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } },
3949 { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } },
3950 { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } },
3951 { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } },
3952 { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } },
3953 };
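// Note the ROTR rows cost more than the matching ROTL rows: as the comment
// above says, XOP only rotates left, so the amount must be negated first.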
3954 static const CostKindTblEntry AVX2CostTbl[] = {
3955 { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3956 { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
3957 { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } },
3958 { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } },
3959 { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } },
3960 { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } },
3961 { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } },
3962 { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } },
3963 { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } },
3964 { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } },
3965 { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } },
3966 { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } },
3967 { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } },
3968 { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } },
3969 { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } },
3970 { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } },
3971 { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } },
3972 { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } },
3973 { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } },
3974 { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } },
3975 { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } },
3976 { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } },
3977 { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } },
3978 { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } },
3979 { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } },
3980 { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } },
3981 { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } },
3982 { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } },
3983 { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } },
3984 { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } },
3985 { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } },
3986 { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } },
3987 { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } },
3988 { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } },
3989 { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } },
3990 { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } },
3991 { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } },
3992 { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } },
3993 { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } },
3994 { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } },
3995 { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } },
3996 { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } },
3997 { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } },
3998 { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } },
3999 { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } },
4000 { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } },
4001 { ISD::SADDSAT, MVT::v2i64, { 4, 13, 8, 11 } },
4002 { ISD::SADDSAT, MVT::v4i64, { 3, 10, 8, 12 } },
4003 { ISD::SADDSAT, MVT::v4i32, { 2, 6, 7, 9 } },
4004 { ISD::SADDSAT, MVT::v8i32, { 4, 6, 7, 13 } },
4005 { ISD::SADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4006 { ISD::SADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4007 { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } },
4008 { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } },
4009 { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4010 { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4011 { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4012 { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } },
4013 { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } },
4014 { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4015 { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4016 { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4017 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4018 { ISD::SMULO, MVT::v2i64, { 8, 8, 13, 15 } },
4019 { ISD::SMULO, MVT::v8i32, { 8, 20, 13, 24 } },
4020 { ISD::SMULO, MVT::v4i32, { 5, 15, 11, 12 } },
4021 { ISD::SMULO, MVT::v16i16, { 4, 14, 8, 14 } },
4022 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4023 { ISD::SMULO, MVT::v32i8, { 9, 15, 18, 35 } },
4024 { ISD::SMULO, MVT::v16i8, { 6, 22, 14, 21 } },
4025 { ISD::SSUBSAT, MVT::v2i64, { 4, 13, 9, 13 } },
4026 { ISD::SSUBSAT, MVT::v4i64, { 4, 15, 9, 13 } },
4027 { ISD::SSUBSAT, MVT::v4i32, { 3, 14, 9, 11 } },
4028 { ISD::SSUBSAT, MVT::v8i32, { 4, 15, 9, 16 } },
4029 { ISD::SSUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4030 { ISD::SSUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4031 { ISD::UADDSAT, MVT::v2i64, { 2, 8, 6, 6 } },
4032 { ISD::UADDSAT, MVT::v4i64, { 3, 8, 6, 10 } },
4033 { ISD::UADDSAT, MVT::v8i32, { 2, 2, 4, 8 } },
4034 { ISD::UADDSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4035 { ISD::UADDSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4036 { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } },
4037 { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } },
4038 { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } },
4039 { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } },
4040 { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } },
4041 { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } },
4042 { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } },
4043 { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } },
4044 { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } },
4045 { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } },
4046 { ISD::UMULO, MVT::v4i64, { 24, 24, 39, 43 } },
4047 { ISD::UMULO, MVT::v2i64, { 10, 10, 15, 19 } },
4048 { ISD::UMULO, MVT::v8i32, { 8, 11, 13, 23 } },
4049 { ISD::UMULO, MVT::v4i32, { 5, 12, 11, 12 } },
4050 { ISD::UMULO, MVT::v16i16, { 4, 6, 8, 13 } },
4051 { ISD::UMULO, MVT::v8i16, { 2, 8, 6, 6 } },
4052 { ISD::UMULO, MVT::v32i8, { 9, 13, 17, 33 } },
4053 { ISD::UMULO, MVT::v16i8, { 6, 19, 13, 20 } },
4054 { ISD::USUBSAT, MVT::v2i64, { 2, 7, 6, 6 } },
4055 { ISD::USUBSAT, MVT::v4i64, { 3, 7, 6, 10 } },
4056 { ISD::USUBSAT, MVT::v8i32, { 2, 2, 2, 4 } },
4057 { ISD::USUBSAT, MVT::v16i16, { 1, 1, 1, 2 } },
4058 { ISD::USUBSAT, MVT::v32i8, { 1, 1, 1, 2 } },
4059 { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4060 { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4061 { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4062 { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4063 { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4064 { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4065 { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss
4066 { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps
4067 { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps
4068 { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd
4069 { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd
4070 { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd
4071 };
4072 static const CostKindTblEntry AVX1CostTbl[] = {
4073 { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
4074 { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
4075 { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
4076 { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
4077 { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4078 { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
4079 { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4080 { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
4081 { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
4082 { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
4083 { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
4084 { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
4085 { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
4086 { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
4087 { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
4088 { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
4089 { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
4090 { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
4091 { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
4092 { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
4093 { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
4094 { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
4095 { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
4096 { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
4097 { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4098 { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
4099 { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
4100 { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
4101 { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4102 { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
4103 { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
4104 { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
4105 { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
4106 { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
4107 { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
4108 { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
4109 { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
4110 { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
4111 { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
4112 { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
4113 { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
4114 { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
4115 { ISD::SADDSAT, MVT::v2i64, { 6, 13, 8, 11 } },
4116 { ISD::SADDSAT, MVT::v4i64, { 13, 20, 15, 25 } }, // 2 x 128-bit Op + extract/insert
4117 { ISD::SADDSAT, MVT::v8i32, { 12, 18, 14, 24 } }, // 2 x 128-bit Op + extract/insert
4118 { ISD::SADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4119 { ISD::SADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4120 { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4121 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
4122 { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4123 { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4124 { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4125 { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
4126 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4127 { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4128 { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4129 { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4130 { ISD::SMULO, MVT::v4i64, { 20, 20, 33, 37 } },
4131 { ISD::SMULO, MVT::v2i64, { 9, 9, 13, 17 } },
4132 { ISD::SMULO, MVT::v8i32, { 15, 20, 24, 29 } },
4133 { ISD::SMULO, MVT::v4i32, { 7, 15, 11, 13 } },
4134 { ISD::SMULO, MVT::v16i16, { 8, 14, 14, 15 } },
4135 { ISD::SMULO, MVT::v8i16, { 3, 9, 6, 6 } },
4136 { ISD::SMULO, MVT::v32i8, { 20, 20, 37, 39 } },
4137 { ISD::SMULO, MVT::v16i8, { 9, 22, 18, 21 } },
4138 { ISD::SSUBSAT, MVT::v2i64, { 7, 13, 9, 13 } },
4139 { ISD::SSUBSAT, MVT::v4i64, { 15, 21, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4140 { ISD::SSUBSAT, MVT::v8i32, { 15, 19, 18, 29 } }, // 2 x 128-bit Op + extract/insert
4141 { ISD::SSUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4142 { ISD::SSUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4143 { ISD::UADDSAT, MVT::v2i64, { 3, 8, 6, 6 } },
4144 { ISD::UADDSAT, MVT::v4i64, { 8, 11, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4145 { ISD::UADDSAT, MVT::v8i32, { 6, 6, 10, 11 } }, // 2 x 128-bit Op + extract/insert
4146 { ISD::UADDSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4147 { ISD::UADDSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4148 { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4149 { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
4150 { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4151 { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4152 { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4153 { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
4154 { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
4155 { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4156 { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4157 { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4158 { ISD::UMULO, MVT::v4i64, { 24, 26, 39, 45 } },
4159 { ISD::UMULO, MVT::v2i64, { 10, 12, 15, 20 } },
4160 { ISD::UMULO, MVT::v8i32, { 14, 15, 23, 28 } },
4161 { ISD::UMULO, MVT::v4i32, { 7, 12, 11, 13 } },
4162 { ISD::UMULO, MVT::v16i16, { 7, 11, 13, 14 } },
4163 { ISD::UMULO, MVT::v8i16, { 3, 8, 6, 6 } },
4164 { ISD::UMULO, MVT::v32i8, { 19, 19, 35, 37 } },
4165 { ISD::UMULO, MVT::v16i8, { 9, 19, 17, 20 } },
4166 { ISD::USUBSAT, MVT::v2i64, { 3, 7, 6, 6 } },
4167 { ISD::USUBSAT, MVT::v4i64, { 8, 10, 14, 15 } }, // 2 x 128-bit Op + extract/insert
4168 { ISD::USUBSAT, MVT::v8i32, { 4, 4, 7, 8 } }, // 2 x 128-bit Op + extract/insert
4170 { ISD::USUBSAT, MVT::v16i16, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4171 { ISD::USUBSAT, MVT::v32i8, { 3, 3, 5, 6 } }, // 2 x 128-bit Op + extract/insert
4172 { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4173 { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4174 { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4175 { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4176 { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4177 { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4178 { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
4179 { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
4180 { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
4181 { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
4182 { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
4183 { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
4184 };
4185 static const CostKindTblEntry GFNICostTbl[] = {
4186 { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
4187 { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
4188 { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
4189 { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
4190 { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4191 { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4192 { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4193 { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
4194 { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4195 { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
4196 { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
4197 { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4198 { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
4199 { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
4200 { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4201 { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
4202 { X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4203 { X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4204 { X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
4205 };
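// GF2P8AFFINEQB applies an 8x8 bit matrix to every byte, so a per-byte bit
// reversal is a single instruction; the wider-element BITREVERSE/VROTLI rows
// above just wrap it in a byte shuffle.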
4206 static const CostKindTblEntry GLMCostTbl[] = {
4207 { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
4208 { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
4209 { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
4210 { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
4211 };
4212 static const CostKindTblEntry SLMCostTbl[] = {
4213 { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
4214 { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
4215 { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
4216 { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
4217 { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
4218 { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
4219 { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
4220 };
4221 static const CostKindTblEntry SSE42CostTbl[] = {
4222 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
4223 { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
4224 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
4225 { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
4226 { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4227 { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
4228 };
4229 static const CostKindTblEntry SSE41CostTbl[] = {
4230 { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
4231 { ISD::SADDSAT, MVT::v2i64, { 10, 14, 17, 21 } },
4232 { ISD::SADDSAT, MVT::v4i32, { 5, 11, 8, 10 } },
4233 { ISD::SSUBSAT, MVT::v2i64, { 12, 19, 25, 29 } },
4234 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 10, 12 } },
4235 { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
4236 { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4237 { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4238 { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
4239 { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4240 { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4241 { ISD::SMULO, MVT::v2i64, { 9, 11, 13, 17 } },
4242 { ISD::SMULO, MVT::v4i32, { 20, 24, 13, 19 } },
4243 { ISD::SMULO, MVT::v8i16, { 5, 9, 8, 8 } },
4244 { ISD::SMULO, MVT::v16i8, { 13, 22, 24, 25 } },
4245 { ISD::UADDSAT, MVT::v2i64, { 6, 13, 14, 14 } },
4246 { ISD::UADDSAT, MVT::v4i32, { 2, 2, 4, 4 } },
4247 { ISD::USUBSAT, MVT::v2i64, { 6, 10, 14, 14 } },
4248 { ISD::USUBSAT, MVT::v4i32, { 1, 2, 2, 2 } },
4249 { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
4250 { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
4251 { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4252 { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
4253 { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
4254 { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4255 { ISD::UMULO, MVT::v2i64, { 14, 20, 15, 20 } },
4256 { ISD::UMULO, MVT::v4i32, { 19, 22, 12, 18 } },
4257 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4258 { ISD::UMULO, MVT::v16i8, { 13, 19, 18, 20 } },
4259 };
4260 static const CostKindTblEntry SSSE3CostTbl[] = {
4261 { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
4262 { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
4263 { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
4264 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
4265 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
4266 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
4267 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
4268 { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
4269 { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
4270 { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
4271 { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
4272 { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
4273 { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
4274 { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
4275 { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
4276 { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
4277 { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
4278 { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
4279 { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
4280 { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
4281 { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
4282 { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
4283 };
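// SSSE3's PSHUFB enables the per-nibble look-up-table lowering for
// CTPOP/BITREVERSE/CTLZ/CTTZ, which is why these costs drop sharply
// compared to the SSE2 rows below.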
4284 static const CostKindTblEntry SSE2CostTbl[] = {
4285 { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
4286 { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
4287 { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
4288 { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
4289 { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
4290 { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
4291 { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
4292 { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
4293 { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
4294 { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
4295 { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
4296 { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
4297 { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
4298 { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
4299 { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
4300 { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
4301 { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
4302 { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
4303 { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
4304 { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
4305 { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
4306 { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
4307 { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
4308 { ISD::SADDSAT, MVT::v2i64, { 12, 14, 24, 24 } },
4309 { ISD::SADDSAT, MVT::v4i32, { 6, 11, 11, 12 } },
4310 { ISD::SADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4311 { ISD::SADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4312 { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4313 { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
4314 { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
4315 { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
4316 { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4317 { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
4318 { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
4319 { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
4320 { ISD::SMULO, MVT::v2i64, { 30, 33, 13, 23 } },
4321 { ISD::SMULO, MVT::v4i32, { 20, 24, 23, 23 } },
4322 { ISD::SMULO, MVT::v8i16, { 5, 10, 8, 8 } },
4323 { ISD::SMULO, MVT::v16i8, { 13, 23, 24, 25 } },
4324 { ISD::SSUBSAT, MVT::v2i64, { 16, 19, 31, 31 } },
4325 { ISD::SSUBSAT, MVT::v4i32, { 6, 14, 12, 13 } },
4326 { ISD::SSUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4327 { ISD::SSUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4328 { ISD::UADDSAT, MVT::v2i64, { 7, 13, 14, 14 } },
4329 { ISD::UADDSAT, MVT::v4i32, { 4, 5, 7, 7 } },
4330 { ISD::UADDSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4331 { ISD::UADDSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4332 { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
4333 { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
4334 { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
4335 { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
4336 { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
4337 { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
4338 { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
4339 { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
4340 { ISD::UMULO, MVT::v2i64, { 30, 33, 15, 29 } },
4341 { ISD::UMULO, MVT::v4i32, { 19, 22, 14, 18 } },
4342 { ISD::UMULO, MVT::v8i16, { 4, 9, 7, 7 } },
4343 { ISD::UMULO, MVT::v16i8, { 13, 19, 20, 20 } },
4344 { ISD::USUBSAT, MVT::v2i64, { 7, 10, 14, 14 } },
4345 { ISD::USUBSAT, MVT::v4i32, { 4, 4, 7, 7 } },
4346 { ISD::USUBSAT, MVT::v8i16, { 1, 2, 1, 1 } },
4347 { ISD::USUBSAT, MVT::v16i8, { 1, 2, 1, 1 } },
4348 { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
4349 { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
4350 { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4351 { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
4352 };
4353 static const CostKindTblEntry SSE1CostTbl[] = {
4354 { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
4355 { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
4356 { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
4357 { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
4358 };
4359 static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
4360 { ISD::CTTZ, MVT::i64, { 1, 1, 1, 1 } },
4361 };
4362 static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
4363 { ISD::CTTZ, MVT::i32, { 1, 1, 1, 1 } },
4364 { ISD::CTTZ, MVT::i16, { 2, 1, 1, 1 } },
4365 { ISD::CTTZ, MVT::i8, { 2, 1, 1, 1 } },
4366 };
4367 static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
4368 { ISD::CTLZ, MVT::i64, { 1, 1, 1, 1 } },
4369 };
4370 static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
4371 { ISD::CTLZ, MVT::i32, { 1, 1, 1, 1 } },
4372 { ISD::CTLZ, MVT::i16, { 2, 1, 1, 1 } },
4373 { ISD::CTLZ, MVT::i8, { 2, 1, 1, 1 } },
4374 };
4375 static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
4376 { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
4377 };
4378 static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
4379 { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
4380 { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
4381 { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
4382 };
4383 static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
4384 { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
4385 { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
4386 { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4387 { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4388 { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR
4389 { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR
4390 { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR
4391 { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4392 { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF
4393 { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF
4394 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF
4395 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF
4396 { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
4397 { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
4398 { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
4399 { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
4400 { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
4401 { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
4402 { ISD::SADDSAT, MVT::i64, { 4, 4, 7, 10 } },
4403 { ISD::SSUBSAT, MVT::i64, { 4, 5, 8, 11 } },
4404 { ISD::UADDSAT, MVT::i64, { 2, 3, 4, 7 } },
4405 { ISD::USUBSAT, MVT::i64, { 2, 3, 4, 7 } },
4406 { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
4407 { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
4408 { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
4409 { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
4410 { ISD::SADDO, MVT::i64, { 2, 2, 4, 6 } },
4411 { ISD::UADDO, MVT::i64, { 2, 2, 4, 6 } },
4412 { ISD::SMULO, MVT::i64, { 4, 4, 4, 6 } },
4413 { ISD::UMULO, MVT::i64, { 8, 8, 4, 7 } },
4414 };
4415 static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
4416 { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4417 { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV
4418 { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA
4419 { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
4420 { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
4421 { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
4422 { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
4423 { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4424 { ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4425 { ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4426 { ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4427 { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
4428 { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
4429 { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
4430 { ISD::CTTZ, MVT::i32, { 2, 2, 3, 3 } }, // TEST+BSF+CMOV/BRANCH
4431 { ISD::CTTZ, MVT::i16, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4432 { ISD::CTTZ, MVT::i8, { 2, 2, 2, 3 } }, // TEST+BSF+CMOV/BRANCH
4433 { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 1, 2 } }, // BSF
4434 { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 2 } }, // BSF
4435 { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 2 } }, // BSF
4436 { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
4437 { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
4438 { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
4439 { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
4440 { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
4441 { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
4442 { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
4443 { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
4444 { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
4445 { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
4446 { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
4447 { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
4448 { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
4449 { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
4450 { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
4451 { ISD::SADDSAT, MVT::i32, { 3, 4, 6, 9 } },
4452 { ISD::SADDSAT, MVT::i16, { 4, 4, 7, 10 } },
4453 { ISD::SADDSAT, MVT::i8, { 4, 5, 8, 11 } },
4454 { ISD::SSUBSAT, MVT::i32, { 4, 4, 7, 10 } },
4455 { ISD::SSUBSAT, MVT::i16, { 4, 4, 7, 10 } },
4456 { ISD::SSUBSAT, MVT::i8, { 4, 5, 8, 11 } },
4457 { ISD::UADDSAT, MVT::i32, { 2, 3, 4, 7 } },
4458 { ISD::UADDSAT, MVT::i16, { 2, 3, 4, 7 } },
4459 { ISD::UADDSAT, MVT::i8, { 3, 3, 5, 8 } },
4460 { ISD::USUBSAT, MVT::i32, { 2, 3, 4, 7 } },
4461 { ISD::USUBSAT, MVT::i16, { 2, 3, 4, 7 } },
4462 { ISD::USUBSAT, MVT::i8, { 3, 3, 5, 8 } },
4463 { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
4464 { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
4465 { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
4466 { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
4467 { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
4468 { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
4469 { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
4470 { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
4471 { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
4472 { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
4473 { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
4474 { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
4475 { ISD::SADDO, MVT::i32, { 2, 2, 4, 6 } },
4476 { ISD::SADDO, MVT::i16, { 2, 2, 4, 6 } },
4477 { ISD::SADDO, MVT::i8, { 2, 2, 4, 6 } },
4478 { ISD::UADDO, MVT::i32, { 2, 2, 4, 6 } },
4479 { ISD::UADDO, MVT::i16, { 2, 2, 4, 6 } },
4480 { ISD::UADDO, MVT::i8, { 2, 2, 4, 6 } },
4481 { ISD::SMULO, MVT::i32, { 2, 2, 4, 6 } },
4482 { ISD::SMULO, MVT::i16, { 5, 5, 4, 6 } },
4483 { ISD::SMULO, MVT::i8, { 6, 6, 4, 6 } },
4484 { ISD::UMULO, MVT::i32, { 6, 6, 4, 8 } },
4485 { ISD::UMULO, MVT::i16, { 6, 6, 4, 9 } },
4486 { ISD::UMULO, MVT::i8, { 6, 6, 4, 6 } },
4487 };
4488
4489 Type *RetTy = ICA.getReturnType();
4490 Type *OpTy = RetTy;
4491 Intrinsic::ID IID = ICA.getID();
4492 unsigned ISD = ISD::DELETED_NODE;
4493 switch (IID) {
4494 default:
4495 break;
4496 case Intrinsic::abs:
4497 ISD = ISD::ABS;
4498 break;
4499 case Intrinsic::bitreverse:
4500 ISD = ISD::BITREVERSE;
4501 break;
4502 case Intrinsic::bswap:
4503 ISD = ISD::BSWAP;
4504 break;
4505 case Intrinsic::ctlz:
4506 ISD = ISD::CTLZ;
4507 break;
4508 case Intrinsic::ctpop:
4509 ISD = ISD::CTPOP;
4510 break;
4511 case Intrinsic::cttz:
4512 ISD = ISD::CTTZ;
4513 break;
4514 case Intrinsic::fshl:
4515 ISD = ISD::FSHL;
4516 if (!ICA.isTypeBasedOnly()) {
4517 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4518 if (Args[0] == Args[1]) {
4519 ISD = ISD::ROTL;
4520 // Handle uniform constant rotation amounts.
4521 // TODO: Handle funnel-shift cases.
4522 const APInt *Amt;
4523 if (Args[2] &&
4524 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4525 ISD = X86ISD::VROTLI;
4526 }
4527 }
4528 break;
4529 case Intrinsic::fshr:
4530 // FSHR has same costs so don't duplicate.
4531 ISD = ISD::FSHL;
4532 if (!ICA.isTypeBasedOnly()) {
4533 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4534 if (Args[0] == Args[1]) {
4535 ISD = ISD::ROTR;
4536 // Handle uniform constant rotation amount.
4537 // TODO: Handle funnel-shift cases.
4538 const APInt *Amt;
4539 if (Args[2] &&
4540 PatternMatch::match(Args[2], PatternMatch::m_APIntAllowPoison(Amt)))
4541 ISD = X86ISD::VROTLI;
4542 }
4543 }
4544 break;
4545 case Intrinsic::lrint:
4546 case Intrinsic::llrint: {
4547 // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which
4548 // have the same costs as the CVTTP2SI (fptosi) instructions
4549 const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes();
4550 return getCastInstrCost(Instruction::FPToSI, RetTy, ArgTys[0],
4551 TTI::CastContextHint::None, CostKind);
4552 }
4553 case Intrinsic::maxnum:
4554 case Intrinsic::minnum:
4555 // FMINNUM has same costs so don't duplicate.
4556 ISD = ISD::FMAXNUM;
4557 break;
4558 case Intrinsic::sadd_sat:
4559 ISD = ISD::SADDSAT;
4560 break;
4561 case Intrinsic::smax:
4562 ISD = ISD::SMAX;
4563 break;
4564 case Intrinsic::smin:
4565 ISD = ISD::SMIN;
4566 break;
4567 case Intrinsic::ssub_sat:
4568 ISD = ISD::SSUBSAT;
4569 break;
4570 case Intrinsic::uadd_sat:
4571 ISD = ISD::UADDSAT;
4572 break;
4573 case Intrinsic::umax:
4574 ISD = ISD::UMAX;
4575 break;
4576 case Intrinsic::umin:
4577 ISD = ISD::UMIN;
4578 break;
4579 case Intrinsic::usub_sat:
4580 ISD = ISD::USUBSAT;
4581 break;
4582 case Intrinsic::sqrt:
4583 ISD = ISD::FSQRT;
4584 break;
4585 case Intrinsic::sadd_with_overflow:
4586 case Intrinsic::ssub_with_overflow:
4587 // SSUBO has same costs so don't duplicate.
4588 ISD = ISD::SADDO;
4589 OpTy = RetTy->getContainedType(0);
4590 break;
4591 case Intrinsic::uadd_with_overflow:
4592 case Intrinsic::usub_with_overflow:
4593 // USUBO has same costs so don't duplicate.
4594 ISD = ISD::UADDO;
4595 OpTy = RetTy->getContainedType(0);
4596 break;
4597 case Intrinsic::smul_with_overflow:
4598 ISD = ISD::SMULO;
4599 OpTy = RetTy->getContainedType(0);
4600 break;
4601 case Intrinsic::umul_with_overflow:
4602 ISD = ISD::UMULO;
4603 OpTy = RetTy->getContainedType(0);
4604 break;
4605 }
4606
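// Only intrinsics that mapped onto an ISD/X86ISD opcode above are costed via
// the tables; anything else falls through to the generic base-class
// implementation.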
4607 if (ISD != ISD::DELETED_NODE) {
4608 auto adjustTableCost = [&](int ISD, unsigned Cost,
4609 std::pair<InstructionCost, MVT> LT,
4610 FastMathFlags FMF) -> InstructionCost {
4611 InstructionCost LegalizationCost = LT.first;
4612 MVT MTy = LT.second;
4613
4614 // If there are no NaNs to deal with, then these are reduced to a
4615 // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
4616 // assume is used in the non-fast case.
4617 if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
4618 if (FMF.noNaNs())
4619 return LegalizationCost * 1;
4620 }
4621
4622 // For cases where some ops can be folded into a load/store, assume free.
4623 if (MTy.isScalarInteger()) {
4624 if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
4625 if (const Instruction *II = ICA.getInst()) {
4626 if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
4627 return TTI::TCC_Free;
4628 if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
4629 if (LI->hasOneUse())
4630 return TTI::TCC_Free;
4631 }
4632 }
4633 }
4634 }
4635
4636 return LegalizationCost * (int)Cost;
4637 };
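// A sketch of how adjustTableCost scales costs; the values here are
// illustrative, not actual table data. If a v32i8 op legalizes to two v16i8
// ops (LT.first == 2) and the v16i8 table entry costs C, the reported cost
// is 2 * C; the no-NaNs FMINNUM/FMAXNUM case instead collapses to one
// MIN**/MAX** per legalized op.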
4638
4639 // Legalize the type.
4640 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy);
4641 MVT MTy = LT.second;
4642
4643 // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
4644 if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
4645 (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
4646 !MTy.isVector() && !ICA.isTypeBasedOnly()) {
4647 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
4648 if (auto *Cst = dyn_cast<ConstantInt>(Args[1]))
4649 if (Cst->isAllOnesValue())
4650 ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
4651 }
4652
4653 // FSQRT is a single instruction.
4654 if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
4655 return LT.first;
4656
4657 if (ST->useGLMDivSqrtCosts())
4658 if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
4659 if (auto KindCost = Entry->Cost[CostKind])
4660 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4661
4662 if (ST->useSLMArithCosts())
4663 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
4664 if (auto KindCost = Entry->Cost[CostKind])
4665 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4666
4667 if (ST->hasVBMI2())
4668 if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
4669 if (auto KindCost = Entry->Cost[CostKind])
4670 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4671
4672 if (ST->hasBITALG())
4673 if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
4674 if (auto KindCost = Entry->Cost[CostKind])
4675 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4676
4677 if (ST->hasVPOPCNTDQ())
4678 if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
4679 if (auto KindCost = Entry->Cost[CostKind])
4680 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4681
4682 if (ST->hasGFNI())
4683 if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
4684 if (auto KindCost = Entry->Cost[CostKind])
4685 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4686
4687 if (ST->hasCDI())
4688 if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
4689 if (auto KindCost = Entry->Cost[CostKind])
4690 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4691
4692 if (ST->hasBWI())
4693 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
4694 if (auto KindCost = Entry->Cost[CostKind])
4695 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4696
4697 if (ST->hasAVX512())
4698 if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
4699 if (auto KindCost = Entry->Cost[CostKind])
4700 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4701
4702 if (ST->hasXOP())
4703 if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
4704 if (auto KindCost = Entry->Cost[CostKind])
4705 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4706
4707 if (ST->hasAVX2())
4708 if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
4709 if (auto KindCost = Entry->Cost[CostKind])
4710 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4711
4712 if (ST->hasAVX())
4713 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
4714 if (auto KindCost = Entry->Cost[CostKind])
4715 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4716
4717 if (ST->hasSSE42())
4718 if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
4719 if (auto KindCost = Entry->Cost[CostKind])
4720 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4721
4722 if (ST->hasSSE41())
4723 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
4724 if (auto KindCost = Entry->Cost[CostKind])
4725 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4726
4727 if (ST->hasSSSE3())
4728 if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
4729 if (auto KindCost = Entry->Cost[CostKind])
4730 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4731
4732 if (ST->hasSSE2())
4733 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
4734 if (auto KindCost = Entry->Cost[CostKind])
4735 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4736
4737 if (ST->hasSSE1())
4738 if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
4739 if (auto KindCost = Entry->Cost[CostKind])
4740 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4741
4742 if (ST->hasBMI()) {
4743 if (ST->is64Bit())
4744 if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
4745 if (auto KindCost = Entry->Cost[CostKind])
4746 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4747
4748 if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
4749 if (auto KindCost = Entry->Cost[CostKind])
4750 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4751 }
4752
4753 if (ST->hasLZCNT()) {
4754 if (ST->is64Bit())
4755 if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
4756 if (auto KindCost = Entry->Cost[CostKind])
4757 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4758
4759 if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
4760 if (auto KindCost = Entry->Cost[CostKind])
4761 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4762 }
4763
4764 if (ST->hasPOPCNT()) {
4765 if (ST->is64Bit())
4766 if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
4767 if (auto KindCost = Entry->Cost[CostKind])
4768 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4769
4770 if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
4771 if (auto KindCost = Entry->Cost[CostKind])
4772 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4773 }
4774
4775 if (ST->is64Bit())
4776 if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
4777 if (auto KindCost = Entry->Cost[CostKind])
4778 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4779
4780 if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
4781 if (auto KindCost = Entry->Cost[CostKind])
4782 return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
4783
4784 // Without arg data, we need to compute the expanded costs of custom lowered
4785 // intrinsics to prevent use of the (very low) default costs.
4786 if (ICA.isTypeBasedOnly() &&
4787 (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
4788 Type *CondTy = RetTy->getWithNewBitWidth(1);
4789 InstructionCost Cost = 0;
4790 Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
4791 Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
4792 Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
4793 Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
4794 Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
4795 Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
4796 CmpInst::ICMP_EQ, CostKind);
4797 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
4798 CmpInst::ICMP_EQ, CostKind);
4799 return Cost;
4800 }
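// The per-op costs accumulated above correspond to a generic funnel-shift
// expansion along these lines (fshr mirrors the shift directions):
//   %amt = and i32 %z, 31                  ; And (amount modulo bitwidth)
//   %inv = sub i32 32, %amt                ; Sub
//   %hi = shl i32 %x, %amt                 ; Shl
//   %lo = lshr i32 %y, %inv                ; LShr
//   %or = or i32 %hi, %lo                  ; Or
//   %is0 = icmp eq i32 %amt, 0             ; ICmp (guard the oversized lshr)
//   %res = select i1 %is0, i32 %x, i32 %or ; Select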
4801 }
4802
4803 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
4804}
4805
4806InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4807 TTI::TargetCostKind CostKind,
4808 unsigned Index, const Value *Op0,
4809 const Value *Op1) const {
4810 static const CostTblEntry SLMCostTbl[] = {
4811 { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
4812 { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
4813 { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
4814 { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
4815 };
4816
4817 assert(Val->isVectorTy() && "This must be a vector type");
4818 Type *ScalarType = Val->getScalarType();
4819 InstructionCost RegisterFileMoveCost = 0;
4820
4821 // Non-immediate extraction/insertion can be handled as a sequence of
4822 // aliased loads+stores via the stack.
4823 if (Index == -1U && (Opcode == Instruction::ExtractElement ||
4824 Opcode == Instruction::InsertElement)) {
4825 // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
4826 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
4827
4828 // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
4829 assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
4830 Align VecAlign = DL.getPrefTypeAlign(Val);
4831 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
4832
4833 // Extract - store vector to stack, load scalar.
4834 if (Opcode == Instruction::ExtractElement) {
4835 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4836 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
4837 CostKind);
4838 }
4839 // Insert - store vector to stack, store scalar, load vector.
4840 if (Opcode == Instruction::InsertElement) {
4841 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
4842 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
4843 CostKind) +
4844 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind);
4845 }
4846 }
4847
4848 if (Index != -1U && (Opcode == Instruction::ExtractElement ||
4849 Opcode == Instruction::InsertElement)) {
4850 // Extraction of vXi1 elements is now efficiently handled by MOVMSK.
4851 if (Opcode == Instruction::ExtractElement &&
4852 ScalarType->getScalarSizeInBits() == 1 &&
4853 cast<FixedVectorType>(Val)->getNumElements() > 1)
4854 return 1;
4855
4856 // Legalize the type.
4857 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4858
4859 // This type is legalized to a scalar type.
4860 if (!LT.second.isVector())
4861 return TTI::TCC_Free;
4862
4863 // The type may be split. Normalize the index to the new type.
4864 unsigned SizeInBits = LT.second.getSizeInBits();
4865 unsigned NumElts = LT.second.getVectorNumElements();
4866 unsigned SubNumElts = NumElts;
4867 Index = Index % NumElts;
4868
4869 // For >128-bit vectors, we need to extract higher 128-bit subvectors.
4870 // For inserts, we also need to insert the subvector back.
4871 if (SizeInBits > 128) {
4872 assert((SizeInBits % 128) == 0 && "Illegal vector");
4873 unsigned NumSubVecs = SizeInBits / 128;
4874 SubNumElts = NumElts / NumSubVecs;
4875 if (SubNumElts <= Index) {
4876 RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
4877 Index %= SubNumElts;
4878 }
4879 }
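// E.g. element 5 of a v8i32 on AVX2 lives in the upper 128-bit lane: an
// extract pays one extra subvector move (RegisterFileMoveCost += 1) and the
// index is rebased to 1 within that lane; an insert must also put the
// subvector back (+= 2).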
4880
4881 MVT MScalarTy = LT.second.getScalarType();
4882 auto IsCheapPInsrPExtrInsertPS = [&]() {
4883 // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
4884 // Inserting f32 into index0 is just movss.
4885 // Also, assume insertps is relatively cheap on all >= SSE41 targets.
4886 return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4887 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4888 (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
4889 Opcode == Instruction::InsertElement) ||
4890 (MScalarTy == MVT::f32 && ST->hasSSE41() &&
4891 Opcode == Instruction::InsertElement);
4892 };
4893
4894 if (Index == 0) {
4895 // Floating point scalars are already located in index #0.
4896 // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
4897 // true for all.
4898 if (ScalarType->isFloatingPointTy() &&
4899 (Opcode != Instruction::InsertElement || !Op0 ||
4900 isa<UndefValue>(Op0)))
4901 return RegisterFileMoveCost;
4902
4903 if (Opcode == Instruction::InsertElement &&
4904 isa_and_nonnull<UndefValue>(Op0)) {
4905 // Consider the gather cost to be cheap.
4906 if (isa_and_nonnull<LoadInst>(Op1))
4907 return RegisterFileMoveCost;
4908 if (!IsCheapPInsrPExtrInsertPS()) {
4909 // mov constant-to-GPR + movd/movq GPR -> XMM.
4910 if (isa_and_nonnull<Constant>(Op1) && Op1->getType()->isIntegerTy())
4911 return 2 + RegisterFileMoveCost;
4912 // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
4913 return 1 + RegisterFileMoveCost;
4914 }
4915 }
4916
4917 // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
4918 if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
4919 return 1 + RegisterFileMoveCost;
4920 }
4921
4922 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4923 assert(ISD && "Unexpected vector opcode");
4924 if (ST->useSLMArithCosts())
4925 if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
4926 return Entry->Cost + RegisterFileMoveCost;
4927
4928 // Consider cheap cases.
4929 if (IsCheapPInsrPExtrInsertPS())
4930 return 1 + RegisterFileMoveCost;
4931
4932 // For extractions we just need to shuffle the element to index 0, which
4933 // should be very cheap (assume cost = 1). For insertions we need to shuffle
4934 // the element into its destination. In both cases we must handle the
4935 // subvector move(s).
4936 // If the vector type is already less than 128-bits then don't reduce it.
4937 // TODO: Under what circumstances should we shuffle using the full width?
4938 InstructionCost ShuffleCost = 1;
4939 if (Opcode == Instruction::InsertElement) {
4940 auto *SubTy = cast<VectorType>(Val);
4941 EVT VT = TLI->getValueType(DL, Val);
4942 if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
4943 SubTy = FixedVectorType::get(ScalarType, SubNumElts);
4944 ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
4945 CostKind, 0, SubTy);
4946 }
4947 int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
4948 return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
4949 }
4950
4951 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
4952 RegisterFileMoveCost;
4953}
4954
4955InstructionCost X86TTIImpl::getScalarizationOverhead(
4956 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4957 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4958 ArrayRef<Value *> VL) const {
4959 assert(DemandedElts.getBitWidth() ==
4960 cast<FixedVectorType>(Ty)->getNumElements() &&
4961 "Vector size mismatch");
4962
4963 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4964 MVT MScalarTy = LT.second.getScalarType();
4965 unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
4966 InstructionCost Cost = 0;
4967
4968 constexpr unsigned LaneBitWidth = 128;
4969 assert((LegalVectorBitWidth < LaneBitWidth ||
4970 (LegalVectorBitWidth % LaneBitWidth) == 0) &&
4971 "Illegal vector");
4972
4973 const int NumLegalVectors = LT.first.getValue();
4974 assert(NumLegalVectors >= 0 && "Negative cost!");
4975
4976 // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
4977 // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has
4978 // a special heuristic regarding poison input which is passed here in
4979 // ForPoisonSrc.
4980 if (Insert && !ForPoisonSrc) {
4981 // This is nearly identical to BaseT::getScalarizationOverhead(), except
4982 // it is passing nullptr to getVectorInstrCost() for Op0 (instead of
4983 // Constant::getNullValue()), which makes the X86TTIImpl
4984 // getVectorInstrCost() return 0 instead of 1.
4985 for (unsigned I : seq(DemandedElts.getBitWidth())) {
4986 if (!DemandedElts[I])
4987 continue;
4988 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, I,
4989 nullptr,
4990 VL.empty() ? nullptr : VL[I]);
4991 }
4992 return Cost;
4993 }
4994
4995 if (Insert) {
4996 if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
4997 (MScalarTy.isInteger() && ST->hasSSE41()) ||
4998 (MScalarTy == MVT::f32 && ST->hasSSE41())) {
4999 // For types we can insert directly, insertion into 128-bit sub vectors is
5000 // cheap, followed by a cheap chain of concatenations.
5001 if (LegalVectorBitWidth <= LaneBitWidth) {
5002 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
5003 /*Extract*/ false, CostKind);
5004 } else {
5005 // In each 128-bit lane, if at least one index is demanded but not all
5006 // indices are demanded and this 128-bit lane is not the first lane of
5007 // the legalized vector, then this lane needs an extracti128; if in
5008 // each 128-bit lane there is at least one demanded index, this lane
5009 // needs an inserti128.
5010
5011 // The following cases will help you build a better understanding:
5012 // Assume we insert several elements into a v8i32 vector on AVX2:
5013 // Case#1: inserting into index 1 needs vpinsrd + inserti128.
5014 // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
5015 // inserti128.
5016 // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
5017 assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector");
5018 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5019 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5020 unsigned NumLegalElts =
5021 LT.second.getVectorNumElements() * NumLegalVectors;
5022 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5023 "Vector has been legalized to smaller element count");
5024 assert((NumLegalElts % NumLanesTotal) == 0 &&
5025 "Unexpected elts per lane");
5026 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5027
5028 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5029 auto *LaneTy =
5030 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5031
5032 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5033 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5034 NumEltsPerLane, NumEltsPerLane * I);
5035 if (LaneEltMask.isZero())
5036 continue;
5037 // FIXME: we don't need to extract if all non-demanded elements
5038 // are legalization-inserted padding.
5039 if (!LaneEltMask.isAllOnes())
5040 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5041 CostKind, I * NumEltsPerLane, LaneTy);
5042 Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
5043 /*Extract*/ false, CostKind);
5044 }
5045
5046 APInt AffectedLanes =
5047 APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal);
5048 APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask(
5049 AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true);
5050 for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) {
5051 for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) {
5052 unsigned I = NumLegalLanes * LegalVec + Lane;
5053 // No need to insert unaffected lane; or lane 0 of each legal vector
5054 // iff ALL lanes of that vector were affected and will be inserted.
5055 if (!AffectedLanes[I] ||
5056 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
5057 continue;
5057 Cost +=
5058 getShuffleCost(TTI::SK_InsertSubvector, Ty, Ty, {},
5059 CostKind, I * NumEltsPerLane, LaneTy);
5060 }
5061 }
5062 }
5063 } else if (LT.second.isVector()) {
5064 // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
5065 // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
5066 // series of UNPCK followed by CONCAT_VECTORS - all of these can be
5067 // considered cheap.
5068 if (Ty->isIntOrIntVectorTy())
5069 Cost += DemandedElts.popcount();
5070
5071 // Get the smaller of the legalized or original pow2-extended number of
5072 // vector elements, which represents the number of unpacks we'll end up
5073 // performing.
5074 unsigned NumElts = LT.second.getVectorNumElements();
5075 unsigned Pow2Elts =
5076 PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
5077 Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
5078 }
5079 }
5080
5081 if (Extract) {
5082 // vXi1 can be efficiently extracted with MOVMSK.
5083 // TODO: AVX512 predicate mask handling.
5084 // NOTE: This doesn't work well for roundtrip scalarization.
5085 if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) {
5086 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
5087 unsigned MaxElts = ST->hasAVX2() ? 32 : 16;
5088 unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts;
5089 return MOVMSKCost;
5090 }
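// E.g. extracting all bits of a v32i1 on SSE2 (MaxElts == 16) costs 2: two
// 16-lane PMOVMSKB ops cover the 32 mask bits.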
5091
5092 if (LT.second.isVector()) {
5093 unsigned NumLegalElts =
5094 LT.second.getVectorNumElements() * NumLegalVectors;
5095 assert(NumLegalElts >= DemandedElts.getBitWidth() &&
5096 "Vector has been legalized to smaller element count");
5097
5098 // If we're extracting elements from a 128-bit subvector lane,
5099 // we only need to extract each lane once, not for every element.
5100 if (LegalVectorBitWidth > LaneBitWidth) {
5101 unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth;
5102 unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors;
5103 assert((NumLegalElts % NumLanesTotal) == 0 &&
5104 "Unexpected elts per lane");
5105 unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal;
5106
5107 // Add cost for each demanded 128-bit subvector extraction.
5108 // Luckily this is a lot easier than for insertion.
5109 APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts);
5110 auto *LaneTy =
5111 FixedVectorType::get(Ty->getElementType(), NumEltsPerLane);
5112
5113 for (unsigned I = 0; I != NumLanesTotal; ++I) {
5114 APInt LaneEltMask = WidenedDemandedElts.extractBits(
5115 NumEltsPerLane, I * NumEltsPerLane);
5116 if (LaneEltMask.isZero())
5117 continue;
5118 Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {}, CostKind,
5119 I * NumEltsPerLane, LaneTy);
5120 Cost += BaseT::getScalarizationOverhead(
5121 LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
5122 }
5123
5124 return Cost;
5125 }
5126 }
5127
5128 // Fallback to default extraction.
5129 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
5130 Extract, CostKind);
5131 }
5132
5133 return Cost;
5134}
5135
5136InstructionCost
5137X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
5138 int VF, const APInt &DemandedDstElts,
5139 TTI::TargetCostKind CostKind) const {
5140 const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
5141 // We don't differentiate element types here, only element bit width.
5142 EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);
5143
5144 auto bailout = [&]() {
5145 return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
5146 DemandedDstElts, CostKind);
5147 };
5148
5149 // For now, only deal with AVX512 cases.
5150 if (!ST->hasAVX512())
5151 return bailout();
5152
5153 // Do we have a native shuffle for this element type, or should we promote?
5154 unsigned PromEltTyBits = EltTyBits;
5155 switch (EltTyBits) {
5156 case 32:
5157 case 64:
5158 break; // AVX512F.
5159 case 16:
5160 if (!ST->hasBWI())
5161 PromEltTyBits = 32; // promote to i32, AVX512F.
5162 break; // AVX512BW
5163 case 8:
5164 if (!ST->hasVBMI())
5165 PromEltTyBits = 32; // promote to i32, AVX512F.
5166 break; // AVX512VBMI
5167 case 1:
5168 // There is no support for shuffling i1 elements. We *must* promote.
5169 if (ST->hasBWI()) {
5170 if (ST->hasVBMI())
5171 PromEltTyBits = 8; // promote to i8, AVX512VBMI.
5172 else
5173 PromEltTyBits = 16; // promote to i16, AVX512BW.
5174 break;
5175 }
5176 PromEltTyBits = 32; // promote to i32, AVX512F.
5177 break;
5178 default:
5179 return bailout();
5180 }
5181 auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);
5182
5183 auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
5184 auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);
5185
5186 int NumDstElements = VF * ReplicationFactor;
5187 auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
5188 auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);
5189
5190 // Legalize the types.
5191 MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second;
5192 MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second;
5193 MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second;
5194 MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second;
5195 // They should have legalized into vector types.
5196 if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
5197 !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
5198 return bailout();
5199
5200 if (PromEltTyBits != EltTyBits) {
5201 // If we have to perform the shuffle with wider elt type than our data type,
5202 // then we will first need to anyext (we don't care about the new bits)
5203 // the source elements, and then truncate Dst elements.
5204 InstructionCost PromotionCost;
5205 PromotionCost += getCastInstrCost(
5206 Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
5207 TTI::CastContextHint::None, CostKind);
5208 PromotionCost +=
5209 getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
5210 /*Src=*/PromDstVecTy,
5211 TTI::CastContextHint::None, CostKind);
5212 return PromotionCost + getReplicationShuffleCost(PromEltTy,
5213 ReplicationFactor, VF,
5214 DemandedDstElts, CostKind);
5215 }
5216
5217 assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
5218 LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
5219 "We expect that the legalization doesn't affect the element width, "
5220 "doesn't coalesce/split elements.");
5221
5222 unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
5223 unsigned NumDstVectors =
5224 divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);
5225
5226 auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);
5227
5228 // Not all the produced Dst elements may be demanded. In our case,
5229 // given that a single Dst vector is formed by a single shuffle,
5230 // if all elements that will form a single Dst vector aren't demanded,
5231 // then we won't need to do that shuffle, so adjust the cost accordingly.
5232 APInt DemandedDstVectors = APIntOps::ScaleBitMask(
5233 DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
5234 unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
5235
5236 InstructionCost SingleShuffleCost =
5237 getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
5238 /*Mask=*/{}, CostKind,
5239 /*Index=*/0, /*SubTp=*/nullptr);
5240 return NumDstVectorsDemanded * SingleShuffleCost;
5241}
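// For reference: a replication shuffle repeats each source element
// ReplicationFactor times, e.g. ReplicationFactor == 2 and VF == 4 gives the
// mask <0,0,1,1,2,2,3,3>; each demanded destination vector is modeled above
// as one single-source permute (e.g. vpermd/vpermb on AVX512).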
5242
5243InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
5244 Align Alignment,
5245 unsigned AddressSpace,
5246 TTI::TargetCostKind CostKind,
5247 TTI::OperandValueInfo OpInfo,
5248 const Instruction *I) const {
5249 // TODO: Handle other cost kinds.
5250 if (CostKind != TTI::TCK_RecipThroughput) {
5251 if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
5252 // Store instruction with index and scale costs 2 Uops.
5253 // Check the preceding GEP to identify non-const indices.
5254 if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
5255 if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
5256 return TTI::TCC_Basic * 2;
5257 }
5258 }
5259 return TTI::TCC_Basic;
5260 }
5261
5262 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5263 "Invalid Opcode");
5264 // Type legalization can't handle structs
5265 if (TLI->getValueType(DL, Src, true) == MVT::Other)
5266 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5267 CostKind, OpInfo, I);
5268
5269 // Legalize the type.
5270 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
5271
5272 auto *VTy = dyn_cast<FixedVectorType>(Src);
5273
5273
5274 InstructionCost Cost = 0;
5275
5276 // Add a cost for constant load to vector.
5277 if (Opcode == Instruction::Store && OpInfo.isConstant())
5278 Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
5279 /*AddressSpace=*/0, CostKind, OpInfo);
5280
5281 // Handle the simple case of non-vectors.
5282 // NOTE: this assumes that legalization never creates vector from scalars!
5283 if (!VTy || !LT.second.isVector()) {
5284 // Each load/store unit costs 1.
5285 return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
5286 }
5287
5288 bool IsLoad = Opcode == Instruction::Load;
5289
5290 Type *EltTy = VTy->getElementType();
5291
5292 const int EltTyBits = DL.getTypeSizeInBits(EltTy);
5293
5294 // Source of truth: how many elements were there in the original IR vector?
5295 const unsigned SrcNumElt = VTy->getNumElements();
5296
5297 // How far have we gotten?
5298 int NumEltRemaining = SrcNumElt;
5299 // Note that we intentionally capture by reference: NumEltRemaining changes.
5300 auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
5301
5302 const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
5303
5304 // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
5305 const unsigned XMMBits = 128;
5306 if (XMMBits % EltTyBits != 0)
5307 // Vector size must be a multiple of the element size. I.e. no padding.
5308 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5309 CostKind, OpInfo, I);
5310 const int NumEltPerXMM = XMMBits / EltTyBits;
5311
5312 auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
5313
5314 for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
5315 NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
5316 // How many elements would a single op deal with at once?
5317 if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
5318 // Vector size must be a multiple of the element size. I.e. no padding.
5319 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
5320 CostKind, OpInfo, I);
5321 int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
5322
5323 assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
5324 assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
5325 (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
5326 "Unless we haven't halved the op size yet, "
5327 "we have less than two op's sized units of work left.");
5328
5329 auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
5330 ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
5331 : XMMVecTy;
5332
5333 assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
5334 "After halving sizes, the vector elt count is no longer a multiple "
5335 "of number of elements per operation?");
5336 auto *CoalescedVecTy =
5337 CurrNumEltPerOp == 1
5338 ? CurrVecTy
5339 : FixedVectorType::get(
5340 IntegerType::get(Src->getContext(),
5341 EltTyBits * CurrNumEltPerOp),
5342 CurrVecTy->getNumElements() / CurrNumEltPerOp);
5343 assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
5344 DL.getTypeSizeInBits(CurrVecTy) &&
5345 "coalesciing elements doesn't change vector width.");
5346
5347 while (NumEltRemaining > 0) {
5348 assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
5349
5350 // Can we use this vector size, as per the remaining element count?
5351 // Iff the vector is naturally aligned, we can do a wide load regardless.
5352 if (NumEltRemaining < CurrNumEltPerOp &&
5353 (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1)
5354 break; // Try a smaller vector size.
5355
5356 // This isn't exactly right. We're using slow unaligned 32-byte accesses
5357 // as a proxy for a double-pumped AVX memory interface such as on
5358 // Sandybridge.
5359 // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
5360 // will be scalarized.
5361 if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
5362 Cost += 2;
5363 else if (CurrOpSizeBytes < 4)
5364 Cost += 2;
5365 else
5366 Cost += 1;
5367
5368 // If we're loading a uniform value, then we don't need to split the load;
5369 // a single (widest) vector load can be reused by all the splits.
5370 if (IsLoad && OpInfo.isUniform())
5371 return Cost;
5372
5373 bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
5374
5375 // If we have fully processed the previous reg, we need to replenish it.
5376 if (SubVecEltsLeft == 0) {
5377 SubVecEltsLeft += CurrVecTy->getNumElements();
5378 // And that's free only for the 0'th subvector of a legalized vector.
5379 if (!Is0thSubVec)
5380 Cost +=
5381 getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
5382 : TTI::ShuffleKind::SK_ExtractSubvector,
5383 VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
5384 }
5385
5386 // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
5387 // for smaller widths (32/16/8) we have to insert/extract them separately.
5388 // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
5389 // but let's pretend that it is also true for 16/8 bit wide ops...)
5390 if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
5391 int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
5392 assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
5393 int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
5394 APInt DemandedElts =
5395 APInt::getBitsSet(CoalescedVecTy->getNumElements(),
5396 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
5397 assert(DemandedElts.popcount() == 1 && "Inserting single value");
5398 Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
5399 !IsLoad, CostKind);
5400 }
5401
5402 SubVecEltsLeft -= CurrNumEltPerOp;
5403 NumEltRemaining -= CurrNumEltPerOp;
5404 Alignment = commonAlignment(Alignment, CurrOpSizeBytes);
5405 }
5406 }
5407
5408 assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
5409
5410 return Cost;
5411}
5412
5413InstructionCost
5414X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
5415 unsigned AddressSpace,
5416 TTI::TargetCostKind CostKind) const {
5417 bool IsLoad = (Instruction::Load == Opcode);
5418 bool IsStore = (Instruction::Store == Opcode);
5419
5420 auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
5421 if (!SrcVTy)
5422 // To calculate scalar take the regular cost, without mask
5423 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
5424
5425 unsigned NumElem = SrcVTy->getNumElements();
5426 auto *MaskTy =
5427 FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
5428 if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment, AddressSpace)) ||
5429 (IsStore && !isLegalMaskedStore(SrcVTy, Alignment, AddressSpace))) {
5430 // Scalarization
5431 APInt DemandedElts = APInt::getAllOnes(NumElem);
5432 InstructionCost MaskSplitCost = getScalarizationOverhead(
5433 MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
5434 InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5435 Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
5436 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5437 InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5438 InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
5439 InstructionCost ValueSplitCost = getScalarizationOverhead(
5440 SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
5441 InstructionCost MemopCost =
5442 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
5443 Alignment, AddressSpace, CostKind);
5444 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
5445 }
5446
5447 // Legalize the type.
5448 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
5449 auto VT = TLI->getValueType(DL, SrcVTy);
5450 InstructionCost Cost = 0;
5451 MVT Ty = LT.second;
5452 if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
5453 // APX masked load/store for scalar is cheap.
5454 return Cost + LT.first;
5455
5456 if (VT.isSimple() && Ty != VT.getSimpleVT() &&
5457 LT.second.getVectorNumElements() == NumElem)
5458 // Promotion requires extend/truncate for data and a shuffle for mask.
5459 Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
5460 0, nullptr) +
5461 getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
5462 0, nullptr);
5463
5464 else if (LT.first * Ty.getVectorNumElements() > NumElem) {
5465 auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5466 (unsigned)LT.first.getValue() *
5467 Ty.getVectorNumElements());
5468 // Expanding requires filling the mask with zeroes.
5469 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
5470 CostKind, 0, MaskTy);
5471 }
5472
5473 // Pre-AVX512 - each maskmov load costs 2; each maskmov store costs ~8.
5474 if (!ST->hasAVX512())
5475 return Cost + LT.first * (IsLoad ? 2 : 8);
5476
5477 // AVX-512 masked load/store is cheaper
5478 return Cost + LT.first;
5479}
5480
5481InstructionCost X86TTIImpl::getPointersChainCost(
5482 ArrayRef<const Value *> Ptrs, const Value *Base,
5483 const TTI::PointersChainInfo &Info, Type *AccessTy,
5484 TTI::TargetCostKind CostKind) const {
5485 if (Info.isSameBase() && Info.isKnownStride()) {
5486 // If all the pointers have a known stride, all the differences are translated
5487 // into constants. X86 memory addressing allows encoding them into the
5488 // displacement, so we just need to take the base GEP cost.
5489 if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Base)) {
5490 SmallVector<const Value *> Indices(BaseGEP->indices());
5491 return getGEPCost(BaseGEP->getSourceElementType(),
5492 BaseGEP->getPointerOperand(), Indices, nullptr,
5493 CostKind);
5494 }
5495 return TTI::TCC_Free;
5496 }
5497 return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind);
5498}
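// E.g. for the chain p, p+4, p+8 with a known stride, each access folds its
// constant difference into the x86 addressing mode ([base + disp]), so only
// the base GEP contributes any cost.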
5499
5500InstructionCost
5501X86TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
5502 const SCEV *Ptr,
5503 TTI::TargetCostKind CostKind) const {
5504 // Address computations in vectorized code with non-consecutive addresses will
5505 // likely result in more instructions compared to scalar code where the
5506 // computation can more often be merged into the index mode. The resulting
5507 // extra micro-ops can significantly decrease throughput.
5508 const unsigned NumVectorInstToHideOverhead = 10;
5509
5510 // Cost modeling of Strided Access Computation is hidden by the indexing
5511 // modes of X86 regardless of the stride value. We don't believe that there
5512 // is a difference between constant strided access in general and constant
5513 // strided access whose stride value is less than or equal to 64.
5514 // Even in the case of (loop invariant) stride whose value is not known at
5515 // compile time, the address computation will not incur more than one extra
5516 // ADD instruction.
5517 if (PtrTy->isVectorTy() && SE && !ST->hasAVX2()) {
5518 // TODO: AVX2 is the current cut-off because we don't have correct
5519 // interleaving costs for prior ISA's.
5520 if (!BaseT::isStridedAccess(Ptr))
5521 return NumVectorInstToHideOverhead;
5522 if (!BaseT::getConstantStrideStep(SE, Ptr))
5523 return 1;
5524 }
5525
5526 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
5527}
5528
5529InstructionCost
5530X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5531 std::optional<FastMathFlags> FMF,
5532 TTI::TargetCostKind CostKind) const {
5533 if (TTI::requiresOrderedReduction(FMF))
5534 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5535
5536 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5537 // throughput and use that as the cost.
5538
5539 static const CostTblEntry SLMCostTbl[] = {
5540 { ISD::FADD, MVT::v2f64, 3 },
5541 { ISD::ADD, MVT::v2i64, 5 },
5542 };
5543
5544 static const CostTblEntry SSE2CostTbl[] = {
5545 { ISD::FADD, MVT::v2f64, 2 },
5546 { ISD::FADD, MVT::v2f32, 2 },
5547 { ISD::FADD, MVT::v4f32, 4 },
5548 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
5549 { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
5550 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
5551 { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
5552 { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
5553 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
5554 { ISD::ADD, MVT::v2i8, 2 },
5555 { ISD::ADD, MVT::v4i8, 2 },
5556 { ISD::ADD, MVT::v8i8, 2 },
5557 { ISD::ADD, MVT::v16i8, 3 },
5558 };
5559
5560 static const CostTblEntry AVX1CostTbl[] = {
5561 { ISD::FADD, MVT::v4f64, 3 },
5562 { ISD::FADD, MVT::v4f32, 3 },
5563 { ISD::FADD, MVT::v8f32, 4 },
5564 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
5565 { ISD::ADD, MVT::v4i64, 3 },
5566 { ISD::ADD, MVT::v8i32, 5 },
5567 { ISD::ADD, MVT::v16i16, 5 },
5568 { ISD::ADD, MVT::v32i8, 4 },
5569 };
5570
5571 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5572 assert(ISD && "Invalid opcode");
5573
5574 // Before legalizing the type, give a chance to look up illegal narrow types
5575 // in the table.
5576 // FIXME: Is there a better way to do this?
5577 EVT VT = TLI->getValueType(DL, ValTy);
5578 if (VT.isSimple()) {
5579 MVT MTy = VT.getSimpleVT();
5580 if (ST->useSLMArithCosts())
5581 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5582 return Entry->Cost;
5583
5584 if (ST->hasAVX())
5585 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5586 return Entry->Cost;
5587
5588 if (ST->hasSSE2())
5589 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5590 return Entry->Cost;
5591 }
5592
5593 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5594
5595 MVT MTy = LT.second;
5596
5597 auto *ValVTy = cast<FixedVectorType>(ValTy);
5598
5599 // Special case: vXi8 mul reductions are performed as vXi16.
5600 if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
5601 auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
5602 auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
5603 return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
5604 TTI::CastContextHint::None,
5605 CostKind) +
5606 getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
5607 }
5608
5609 InstructionCost ArithmeticCost = 0;
5610 if (LT.first != 1 && MTy.isVector() &&
5611 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5612 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5613 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5614 MTy.getVectorNumElements());
5615 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5616 ArithmeticCost *= LT.first - 1;
5617 }
5618
5619 if (ST->useSLMArithCosts())
5620 if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
5621 return ArithmeticCost + Entry->Cost;
5622
5623 if (ST->hasAVX())
5624 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5625 return ArithmeticCost + Entry->Cost;
5626
5627 if (ST->hasSSE2())
5628 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5629 return ArithmeticCost + Entry->Cost;
5630
5631 // FIXME: These assume a naive kshift+binop lowering, which is probably
5632 // conservative in most cases.
5633 static const CostTblEntry AVX512BoolReduction[] = {
5634 { ISD::AND, MVT::v2i1, 3 },
5635 { ISD::AND, MVT::v4i1, 5 },
5636 { ISD::AND, MVT::v8i1, 7 },
5637 { ISD::AND, MVT::v16i1, 9 },
5638 { ISD::AND, MVT::v32i1, 11 },
5639 { ISD::AND, MVT::v64i1, 13 },
5640 { ISD::OR, MVT::v2i1, 3 },
5641 { ISD::OR, MVT::v4i1, 5 },
5642 { ISD::OR, MVT::v8i1, 7 },
5643 { ISD::OR, MVT::v16i1, 9 },
5644 { ISD::OR, MVT::v32i1, 11 },
5645 { ISD::OR, MVT::v64i1, 13 },
5646 };
5647
5648 static const CostTblEntry AVX2BoolReduction[] = {
5649 { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
5650 { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
5651 { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
5652 { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
5653 };
5654
5655 static const CostTblEntry AVX1BoolReduction[] = {
5656 { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
5657 { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
5658 { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5659 { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
5660 { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
5661 { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
5662 { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5663 { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
5664 };
5665
5666 static const CostTblEntry SSE2BoolReduction[] = {
5667 { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
5668 { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
5669 { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
5670 { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
5671 { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
5672 { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
5673 { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
5674 { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
5675 };
5676
5677 // Handle bool allof/anyof patterns.
5678 if (ValVTy->getElementType()->isIntegerTy(1)) {
5679 InstructionCost ArithmeticCost = 0;
5680 if (LT.first != 1 && MTy.isVector() &&
5681 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5682 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5683 auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
5684 MTy.getVectorNumElements());
5685 ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
5686 ArithmeticCost *= LT.first - 1;
5687 }
5688
5689 if (ST->hasAVX512())
5690 if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
5691 return ArithmeticCost + Entry->Cost;
5692 if (ST->hasAVX2())
5693 if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
5694 return ArithmeticCost + Entry->Cost;
5695 if (ST->hasAVX())
5696 if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
5697 return ArithmeticCost + Entry->Cost;
5698 if (ST->hasSSE2())
5699 if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
5700 return ArithmeticCost + Entry->Cost;
5701
5702 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5703 }
5704
5705 unsigned NumVecElts = ValVTy->getNumElements();
5706 unsigned ScalarSize = ValVTy->getScalarSizeInBits();
5707
5708 // Only special-case power of 2 reductions where the scalar type isn't
5709 // changed by type legalization; otherwise fall back to the base cost.
5710 if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
5711 return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
5712
5713 InstructionCost ReductionCost = 0;
5714
5715 auto *Ty = ValVTy;
5716 if (LT.first != 1 && MTy.isVector() &&
5717 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5718 // Type needs to be split. We need LT.first - 1 arithmetic ops.
5719 Ty = FixedVectorType::get(ValVTy->getElementType(),
5720 MTy.getVectorNumElements());
5721 ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5722 ReductionCost *= LT.first - 1;
5723 NumVecElts = MTy.getVectorNumElements();
5724 }
5725
5726 // Now handle reduction with the legal type, taking into account size changes
5727 // at each level.
5728 while (NumVecElts > 1) {
5729 // Determine the size of the remaining vector we need to reduce.
5730 unsigned Size = NumVecElts * ScalarSize;
5731 NumVecElts /= 2;
5732 // If we're reducing from 256/512 bits, use an extract_subvector.
5733 if (Size > 128) {
5734 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5735 ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5736 CostKind, NumVecElts, SubTy);
5737 Ty = SubTy;
5738 } else if (Size == 128) {
5739 // Reducing from 128 bits is a permute of v2f64/v2i64.
5740 FixedVectorType *ShufTy;
5741 if (ValVTy->isFloatingPointTy())
5742 ShufTy =
5743 FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
5744 else
5745 ShufTy =
5746 FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
5747 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5748 {}, CostKind, 0, nullptr);
5749 } else if (Size == 64) {
5750 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5751 FixedVectorType *ShufTy;
5752 if (ValVTy->isFloatingPointTy())
5753 ShufTy =
5754 FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
5755 else
5756 ShufTy =
5757 FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
5758 ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
5759 {}, CostKind, 0, nullptr);
5760 } else {
5761 // Reducing from smaller size is a shift by immediate.
5762 auto *ShiftTy = FixedVectorType::get(
5763 Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
5764 ReductionCost += getArithmeticInstrCost(
5765 Instruction::LShr, ShiftTy, CostKind,
5766 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5767 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5768 }
5769
5770 // Add the arithmetic op for this level.
5771 ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
5772 }
5773
5774 // Add the final extract element to the cost.
5775 return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5776 CostKind, 0, nullptr, nullptr);
5777}
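// The ladder above models the usual log2 shuffle-reduce sequence; e.g. a
// v8f32 fadd reduction on AVX is roughly:
//   vextractf128 + vaddps ; 256 -> 128 bits
//   shufpd + vaddps       ; 128 -> 64 bits
//   shufps + vaddps       ; 64 -> 32 bits
// followed by extracting the scalar result from element 0.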
5778
5779InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty,
5780 TTI::TargetCostKind CostKind,
5781 FastMathFlags FMF) const {
5782 IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF);
5783 return getIntrinsicInstrCost(ICA, CostKind);
5784}
5785
5786InstructionCost
5787X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
5788 FastMathFlags FMF,
5789 TTI::TargetCostKind CostKind) const {
5790 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5791
5792 MVT MTy = LT.second;
5793
5794 int ISD;
5795 if (ValTy->isIntOrIntVectorTy()) {
5796 ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN
5797 : ISD::SMIN;
5798 } else {
5799 assert(ValTy->isFPOrFPVectorTy() &&
5800 "Expected floating point or integer vector type.");
5801 ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum)
5802 ? ISD::FMINNUM
5803 : ISD::FMINIMUM;
5804 }
5805
5806 // We use the Intel Architecture Code Analyzer (IACA) to measure the
5807 // throughput and use that as the cost.
5808
5809 static const CostTblEntry SSE2CostTbl[] = {
5810 {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
5811 {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
5812 {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
5813 };
5814
5815 static const CostTblEntry SSE41CostTbl[] = {
5816 {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
5817 {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
5818 {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
5819 {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
5820 {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
5821 {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
5822 {ISD::SMIN, MVT::v2i8, 3}, // pminsb
5823 {ISD::SMIN, MVT::v4i8, 5}, // pminsb
5824 {ISD::SMIN, MVT::v8i8, 7}, // pminsb
5825 {ISD::SMIN, MVT::v16i8, 6},
5826 {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
5827 {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
5828 {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
5829 {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
5830 };
5831
5832 static const CostTblEntry AVX1CostTbl[] = {
5833 {ISD::SMIN, MVT::v16i16, 6},
5834 {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
5835 {ISD::SMIN, MVT::v32i8, 8},
5836 {ISD::UMIN, MVT::v32i8, 8},
5837 };
5838
5839 static const CostTblEntry AVX512BWCostTbl[] = {
5840 {ISD::SMIN, MVT::v32i16, 8},
5841 {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
5842 {ISD::SMIN, MVT::v64i8, 10},
5843 {ISD::UMIN, MVT::v64i8, 10},
5844 };
5845
5846 // Before legalizing the type, give a chance to look up illegal narrow types
5847 // in the table.
5848 // FIXME: Is there a better way to do this?
5849 EVT VT = TLI->getValueType(DL, ValTy);
5850 if (VT.isSimple()) {
5851 MVT MTy = VT.getSimpleVT();
5852 if (ST->hasBWI())
5853 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5854 return Entry->Cost;
5855
5856 if (ST->hasAVX())
5857 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5858 return Entry->Cost;
5859
5860 if (ST->hasSSE41())
5861 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5862 return Entry->Cost;
5863
5864 if (ST->hasSSE2())
5865 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5866 return Entry->Cost;
5867 }
5868
5869 auto *ValVTy = cast<FixedVectorType>(ValTy);
5870 unsigned NumVecElts = ValVTy->getNumElements();
5871
5872 auto *Ty = ValVTy;
5873 InstructionCost MinMaxCost = 0;
5874 if (LT.first != 1 && MTy.isVector() &&
5875 MTy.getVectorNumElements() < ValVTy->getNumElements()) {
5876 // Type needs to be split. We need LT.first - 1 operations.
5877 Ty = FixedVectorType::get(ValVTy->getElementType(),
5878 MTy.getVectorNumElements());
5879 MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF);
5880 MinMaxCost *= LT.first - 1;
5881 NumVecElts = MTy.getVectorNumElements();
5882 }
5883
5884 if (ST->hasBWI())
5885 if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
5886 return MinMaxCost + Entry->Cost;
5887
5888 if (ST->hasAVX())
5889 if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
5890 return MinMaxCost + Entry->Cost;
5891
5892 if (ST->hasSSE41())
5893 if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
5894 return MinMaxCost + Entry->Cost;
5895
5896 if (ST->hasSSE2())
5897 if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
5898 return MinMaxCost + Entry->Cost;
5899
5900 unsigned ScalarSize = ValTy->getScalarSizeInBits();
5901
5902 // Only special-case power of 2 reductions where the scalar type isn't
5903 // changed by type legalization; otherwise fall back to the base cost.
5904 if (!isPowerOf2_32(ValVTy->getNumElements()) ||
5905 ScalarSize != MTy.getScalarSizeInBits())
5906 return BaseT::getMinMaxReductionCost(IID, ValTy, FMF, CostKind);
5907
5908 // Now handle reduction with the legal type, taking into account size changes
5909 // at each level.
5910 while (NumVecElts > 1) {
5911 // Determine the size of the remaining vector we need to reduce.
5912 unsigned Size = NumVecElts * ScalarSize;
5913 NumVecElts /= 2;
5914 // If we're reducing from 256/512 bits, use an extract_subvector.
5915 if (Size > 128) {
5916 auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
5917 MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
5918 CostKind, NumVecElts, SubTy);
5919 Ty = SubTy;
5920 } else if (Size == 128) {
5921 // Reducing from 128 bits is a permute of v2f64/v2i64.
5922 VectorType *ShufTy;
5923 if (ValTy->isFloatingPointTy())
5924 ShufTy =
5925 FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
5926 else
5927 ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
5928 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5929 CostKind, 0, nullptr);
5930 } else if (Size == 64) {
5931 // Reducing from 64 bits is a shuffle of v4f32/v4i32.
5932 FixedVectorType *ShufTy;
5933 if (ValTy->isFloatingPointTy())
5934 ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
5935 else
5936 ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
5937 MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
5938 CostKind, 0, nullptr);
5939 } else {
5940 // Reducing from smaller size is a shift by immediate.
5941 auto *ShiftTy = FixedVectorType::get(
5942 Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
5943 MinMaxCost += getArithmeticInstrCost(
5944 Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
5945 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
5946 {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None});
5947 }
5948
5949 // Add the arithmetic op for this level.
5950 MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF);
5951 }
5952
5953 // Add the final extract element to the cost.
5954 return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
5955 CostKind, 0, nullptr, nullptr);
5956}
5957
5958/// Calculate the cost of materializing a 64-bit value. This helper
5959/// method might only calculate a fraction of a larger immediate. Therefore it
5960/// is valid to return a cost of ZERO.
5961InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) const {
5962 if (Val == 0)
5963 return TTI::TCC_Free;
5964
5965 if (isInt<32>(Val))
5966 return TTI::TCC_Basic;
5967
5968 return 2 * TTI::TCC_Basic;
5969}
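// E.g. materializing 0 is free (folded to xor), anything representable as a
// sign-extended 32-bit immediate is one basic cost, and a full 64-bit
// immediate (movabs) is charged double.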
5970
5971InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
5972 TTI::TargetCostKind CostKind) const {
5973 assert(Ty->isIntegerTy());
5974
5975 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5976 if (BitSize == 0)
5977 return ~0U;
5978
5979 // Never hoist constants larger than 128bit, because this might lead to
5980 // incorrect code generation or assertions in codegen.
5981 // FIXME: Create a cost model for types larger than i128 once the codegen
5982 // issues have been fixed.
5983 if (BitSize > 128)
5984 return TTI::TCC_Free;
5985
5986 if (Imm == 0)
5987 return TTI::TCC_Free;
5988
5989 // Sign-extend all constants to a multiple of 64-bit.
5990 APInt ImmVal = Imm;
5991 if (BitSize % 64 != 0)
5992 ImmVal = Imm.sext(alignTo(BitSize, 64));
5993
5994 // Split the constant into 64-bit chunks and calculate the cost for each
5995 // chunk.
5996 InstructionCost Cost = 0;
5997 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
5998 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
5999 int64_t Val = Tmp.getSExtValue();
6000 Cost += getIntImmCost(Val);
6001 }
6002 // We need at least one instruction to materialize the constant.
6003 return std::max<InstructionCost>(1, Cost);
6004}
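// E.g. an i128 immediate is costed as two independent 64-bit chunks: zero is
// max(1, 0) = 1, -1 is 1 + 1 = 2 (each chunk is a sign-extended imm32), and
// two arbitrary 64-bit halves cost up to 2 + 2 = 4.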
6005
6006InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
6007 const APInt &Imm, Type *Ty,
6008 TTI::TargetCostKind CostKind,
6009 Instruction *Inst) const {
6010 assert(Ty->isIntegerTy());
6011
6012 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6013 unsigned ImmBitWidth = Imm.getBitWidth();
6014
6015 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6016 // here, so that constant hoisting will ignore this constant.
6017 if (BitSize == 0)
6018 return TTI::TCC_Free;
6019
6020 unsigned ImmIdx = ~0U;
6021 switch (Opcode) {
6022 default:
6023 return TTI::TCC_Free;
6024 case Instruction::GetElementPtr:
6025 // Always hoist the base address of a GetElementPtr. This prevents the
6026 // creation of new constants for every base constant that gets constant
6027 // folded with the offset.
6028 if (Idx == 0)
6029 return 2 * TTI::TCC_Basic;
6030 return TTI::TCC_Free;
6031 case Instruction::Store:
6032 ImmIdx = 0;
6033 break;
6034 case Instruction::ICmp:
6035 // This is an imperfect hack to prevent constant hoisting of
6036 // compares that might be trying to check if a 64-bit value fits in
6037 // 32-bits. The backend can optimize these cases using a right shift by 32.
6038 // There are other predicates and immediates the backend can use shifts for.
6039 if (Idx == 1 && ImmBitWidth == 64) {
6040 uint64_t ImmVal = Imm.getZExtValue();
6041 if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
6042 return TTI::TCC_Free;
6043
6044 if (auto *Cmp = dyn_cast_or_null<CmpInst>(Inst)) {
6045 if (Cmp->isEquality()) {
6046 KnownBits Known = computeKnownBits(Cmp->getOperand(0), DL);
6047 if (Known.countMinTrailingZeros() >= 32)
6048 return TTI::TCC_Free;
6049 }
6050 }
6051 }
6052 ImmIdx = 1;
6053 break;
6054 case Instruction::And:
6055 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
6056 // by using a 32-bit operation with implicit zero extension. Detect such
6057 // immediates here as the normal path expects bit 31 to be sign extended.
6058 if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(32))
6059 return TTI::TCC_Free;
6060 // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
6061 if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
6062 Imm.isMask())
6063 return X86TTIImpl::getIntImmCost(ST->hasBMI2() ? 255 : 65535);
6064 ImmIdx = 1;
6065 break;
6066 case Instruction::Add:
6067 case Instruction::Sub:
6068 // For add/sub, we can use the opposite instruction for INT32_MIN.
6069 if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
6070 return TTI::TCC_Free;
6071 ImmIdx = 1;
6072 break;
6073 case Instruction::UDiv:
6074 case Instruction::SDiv:
6075 case Instruction::URem:
6076 case Instruction::SRem:
6077 // Division by constant is typically expanded later into a different
6078 // instruction sequence. This completely changes the constants.
6079 // Report them as "free" to stop ConstantHoist from marking them as opaque.
6080 return TTI::TCC_Free;
6081 case Instruction::Mul:
6082 case Instruction::Or:
6083 case Instruction::Xor:
6084 ImmIdx = 1;
6085 break;
6086 // Always return TCC_Free for the shift value of a shift instruction.
6087 case Instruction::Shl:
6088 case Instruction::LShr:
6089 case Instruction::AShr:
6090 if (Idx == 1)
6091 return TTI::TCC_Free;
6092 break;
6093 case Instruction::Trunc:
6094 case Instruction::ZExt:
6095 case Instruction::SExt:
6096 case Instruction::IntToPtr:
6097 case Instruction::PtrToInt:
6098 case Instruction::BitCast:
6099 case Instruction::PHI:
6100 case Instruction::Call:
6101 case Instruction::Select:
6102 case Instruction::Ret:
6103 case Instruction::Load:
6104 break;
6105 }
6106
6107 if (Idx == ImmIdx) {
6108 uint64_t NumConstants = divideCeil(BitSize, 64);
6109 InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6110 return (Cost <= NumConstants * TTI::TCC_Basic)
6111 ? static_cast<int>(TTI::TCC_Free)
6112 : Cost;
6113 }
6114
6115 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6116}
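The ICmp special case above leans on a lowering trick: an unsigned "fits in 32 bits" test on a 64-bit value does not need the 0xffffffff immediate in a register, because the backend can test the shifted-out high half instead. A scalar sketch of the equivalent rewrite (illustrative only):

#include <cstdint>

// x <= 0xffffffff (which would need a 64-bit immediate) is equivalent to
// checking that the high 32 bits are zero, which needs no large immediate.
static bool fitsInU32(uint64_t X) {
  return (X >> 32) == 0;
}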
6117
6118InstructionCost
6119X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
6120 const APInt &Imm, Type *Ty,
6121 TTI::TargetCostKind CostKind) const {
6122 assert(Ty->isIntegerTy());
6123
6124 unsigned BitSize = Ty->getPrimitiveSizeInBits();
6125 // There is no cost model for constants with a bit size of 0. Return TCC_Free
6126 // here, so that constant hoisting will ignore this constant.
6127 if (BitSize == 0)
6128 return TTI::TCC_Free;
6129
6130 switch (IID) {
6131 default:
6132 return TTI::TCC_Free;
6133 case Intrinsic::sadd_with_overflow:
6134 case Intrinsic::uadd_with_overflow:
6135 case Intrinsic::ssub_with_overflow:
6136 case Intrinsic::usub_with_overflow:
6137 case Intrinsic::smul_with_overflow:
6138 case Intrinsic::umul_with_overflow:
6139 if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32))
6140 return TTI::TCC_Free;
6141 break;
6142 case Intrinsic::experimental_stackmap:
6143 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6144 return TTI::TCC_Free;
6145 break;
6146 case Intrinsic::experimental_patchpoint_void:
6147 case Intrinsic::experimental_patchpoint:
6148 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64)))
6149 return TTI::TCC_Free;
6150 break;
6151 }
6152 return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
6153}
6154
6155InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
6156 TTI::TargetCostKind CostKind,
6157 const Instruction *I) const {
6158 if (CostKind != TTI::TCK_RecipThroughput)
6159 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
6160 // Branches are assumed to be predicted.
6161 return TTI::TCC_Free;
6162}
6163
6164int X86TTIImpl::getGatherOverhead() const {
6165 // Some CPUs have more overhead for gather. The specified overhead is relative
6166 // to the Load operation. "2" is the number provided by Intel architects. This
6167 // parameter is used for cost estimation of Gather Op and comparison with
6168 // other alternatives.
6169 // TODO: Remove the explicit hasAVX512()? That would mean we would only
6170 // enable gather with a -march.
6171 if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
6172 return 2;
6173
6174 return 1024;
6175}
6176
6177int X86TTIImpl::getScatterOverhead() const {
6178 if (ST->hasAVX512())
6179 return 2;
6180
6181 return 1024;
6182}
6183
6184// Return the average cost of a Gather / Scatter instruction; may be refined later.
6185InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
6186 TTI::TargetCostKind CostKind,
6187 Type *SrcVTy, const Value *Ptr,
6188 Align Alignment,
6189 unsigned AddressSpace) const {
6190
6191 assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
6192 unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
6193
6194 // Try to reduce index size from 64 bit (default for GEP)
6195 // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
6196 // operation will use 16 x 64 indices, which do not fit in a zmm and need
6197 // to be split. Also check that the base pointer is the same for all lanes,
6198 // and that there's at most one variable index.
6199 auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
6200 unsigned IndexSize = DL.getPointerSizeInBits();
6201 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
6202 if (IndexSize < 64 || !GEP)
6203 return IndexSize;
6204
6205 unsigned NumOfVarIndices = 0;
6206 const Value *Ptrs = GEP->getPointerOperand();
6207 if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
6208 return IndexSize;
6209 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
6210 if (isa<Constant>(GEP->getOperand(I)))
6211 continue;
6212 Type *IndxTy = GEP->getOperand(I)->getType();
6213 if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
6214 IndxTy = IndexVTy->getElementType();
6215 if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
6216 !isa<SExtInst>(GEP->getOperand(I))) ||
6217 ++NumOfVarIndices > 1)
6218 return IndexSize; // 64
6219 }
6220 return (unsigned)32;
6221 };
6222
6223 // Try to reduce IndexSize to 32 bits for 16-element vectors.
6224 // By default the IndexSize is equal to the pointer size.
6225 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
6226 ? getIndexSizeInBits(Ptr, DL)
6227 : DL.getPointerSizeInBits();
6228
6229 auto *IndexVTy = FixedVectorType::get(
6230 IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
6231 std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy);
6232 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy);
6233 InstructionCost::CostType SplitFactor =
6234 std::max(IdxsLT.first, SrcLT.first).getValue();
6235 if (SplitFactor > 1) {
6236 // Handle splitting of vector of pointers
6237 auto *SplitSrcTy =
6238 FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
6239 return SplitFactor * getGSVectorCost(Opcode, CostKind, SplitSrcTy, Ptr,
6240 Alignment, AddressSpace);
6241 }
6242
6243 // If we didn't split, this will be a single gather/scatter instruction.
6244 if (CostKind != TTI::TCK_RecipThroughput)
6245 return 1;
6246
6247 // The gather / scatter cost is given by Intel architects. It is a rough
6248 // number since we are looking at one instruction at a time.
6249 const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
6250 : getScatterOverhead();
6251 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
6252 Alignment, AddressSpace, CostKind);
6253}
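Plugging illustrative numbers into the formula above: with a scalar load cost of 1, an 8-lane gather costs 2 + 8 = 10 on fast-gather targets, while the 1024 sentinel from getGatherOverhead() yields 1032, which loses to any scalarized alternative. A sketch (values illustrative only):

// Shape of the cost computed above: fixed overhead plus one scalar memory
// op per lane.
static int gsCostSketch(int GSOverhead, int VF, int ScalarMemOpCost) {
  return GSOverhead + VF * ScalarMemOpCost;
}
// gsCostSketch(2, 8, 1) == 10; gsCostSketch(1024, 8, 1) == 1032.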
6254
6255/// Calculate the cost of Gather / Scatter operation
6256InstructionCost X86TTIImpl::getGatherScatterOpCost(
6257 unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
6258 Align Alignment, TTI::TargetCostKind CostKind,
6259 const Instruction *I = nullptr) const {
6260 if ((Opcode == Instruction::Load &&
6261 (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
6262 forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
6263 Align(Alignment)))) ||
6264 (Opcode == Instruction::Store &&
6265 (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
6266 forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
6267 Align(Alignment)))))
6268 return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
6269 Alignment, CostKind, I);
6270
6271 assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
6272 PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6273 if (!PtrTy && Ptr->getType()->isVectorTy())
6274 PtrTy = dyn_cast<PointerType>(
6275 cast<VectorType>(Ptr->getType())->getElementType());
6276 assert(PtrTy && "Unexpected type for Ptr argument");
6277 unsigned AddressSpace = PtrTy->getAddressSpace();
6278 return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
6279 AddressSpace);
6280}
6281
6282bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6283 const TargetTransformInfo::LSRCost &C2) const {
6284 // X86 specific here are "instruction number 1st priority".
6285 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
6286 C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6287 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
6288 C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6289}
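A self-contained sketch of the std::tie comparison above, reduced to the first two fields to show the "instruction count first" priority (field set abbreviated, values hypothetical):

#include <tuple>

struct LSRCostSketch {
  unsigned Insns, NumRegs; // remaining fields omitted for brevity
};

// Lexicographic: Insns decides first; NumRegs only breaks ties.
static bool isLSRCostLessSketch(const LSRCostSketch &C1,
                                const LSRCostSketch &C2) {
  return std::tie(C1.Insns, C1.NumRegs) < std::tie(C2.Insns, C2.NumRegs);
}
// isLSRCostLessSketch({3, 9}, {4, 2}) is true: fewer instructions wins even
// though more registers are used.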
6290
6291bool X86TTIImpl::canMacroFuseCmp() const {
6292 return ST->hasMacroFusion() || ST->hasBranchFusion();
6293}
6294
6295static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) {
6296 if (!ST->hasAVX())
6297 return false;
6298
6299 if (ScalarTy->isPointerTy())
6300 return true;
6301
6302 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6303 return true;
6304
6305 if (ScalarTy->isHalfTy() && ST->hasBWI())
6306 return true;
6307
6308 if (ScalarTy->isBFloatTy() && ST->hasBF16())
6309 return true;
6310
6311 if (!ScalarTy->isIntegerTy())
6312 return false;
6313
6314 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6315 return IntWidth == 32 || IntWidth == 64 ||
6316 ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
6317}
6318
6319bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment,
6320 unsigned AddressSpace) const {
6321 Type *ScalarTy = DataTy->getScalarType();
6322
6323 // The backend can't handle a single element vector w/o CFCMOV.
6324 if (isa<VectorType>(DataTy) &&
6325 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6326 return ST->hasCF() &&
6327 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/false);
6328
6329 return isLegalMaskedLoadStore(ScalarTy, ST);
6330}
6331
6332bool X86TTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment,
6333 unsigned AddressSpace) const {
6334 Type *ScalarTy = DataTy->getScalarType();
6335
6336 // The backend can't handle a single element vector w/o CFCMOV.
6337 if (isa<VectorType>(DataTy) &&
6338 cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6339 return ST->hasCF() &&
6340 hasConditionalLoadStoreForType(ScalarTy, /*IsStore=*/true);
6341
6342 return isLegalMaskedLoadStore(ScalarTy, ST);
6343}
6344
6345bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const {
6346 unsigned DataSize = DL.getTypeStoreSize(DataType);
6347 // The only supported nontemporal loads are for aligned vectors of 16 or 32
6348 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
6349 // (the equivalent stores only require AVX).
6350 if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
6351 return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
6352
6353 return false;
6354}
6355
6356bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const {
6357 unsigned DataSize = DL.getTypeStoreSize(DataType);
6358
6359 // SSE4A supports nontemporal stores of float and double at arbitrary
6360 // alignment.
6361 if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
6362 return true;
6363
6364 // Besides the SSE4A subtarget exception above, only aligned stores are
6365 // available nontemporally on any other subtarget. And only stores with a size
6366 // of 4..32 bytes (powers of 2, only) are permitted.
6367 if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
6368 !isPowerOf2_32(DataSize))
6369 return false;
6370
6371 // 32-byte vector nontemporal stores are supported by AVX (the equivalent
6372 // loads require AVX2).
6373 if (DataSize == 32)
6374 return ST->hasAVX();
6375 if (DataSize == 16)
6376 return ST->hasSSE1();
6377 return true;
6378}
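Apart from the SSE4A scalar-float exception, the rule above reduces to a size/alignment predicate; a minimal sketch, where the power-of-two test mirrors isPowerOf2_32:

// Non-SSE4A legality core: size must be a power of two in [4, 32] bytes and
// the store must be at least size-aligned. The ISA check (SSE1 for 16 bytes,
// AVX for 32) is layered on top of this.
static bool ntStoreShapeOK(unsigned DataSize, unsigned Alignment) {
  bool Pow2 = DataSize != 0 && (DataSize & (DataSize - 1)) == 0;
  return Alignment >= DataSize && DataSize >= 4 && DataSize <= 32 && Pow2;
}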
6379
6380bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy,
6381 ElementCount NumElements) const {
6382 // movddup
6383 return ST->hasSSE3() && !NumElements.isScalable() &&
6384 NumElements.getFixedValue() == 2 &&
6385 ElementTy == Type::getDoubleTy(ElementTy->getContext());
6386}
6387
6388bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const {
6389 if (!isa<VectorType>(DataTy))
6390 return false;
6391
6392 if (!ST->hasAVX512())
6393 return false;
6394
6395 // The backend can't handle a single element vector.
6396 if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
6397 return false;
6398
6399 Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
6400
6401 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6402 return true;
6403
6404 if (!ScalarTy->isIntegerTy())
6405 return false;
6406
6407 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6408 return IntWidth == 32 || IntWidth == 64 ||
6409 ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
6410}
6411
6412bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy,
6413 Align Alignment) const {
6414 return isLegalMaskedExpandLoad(DataTy, Alignment);
6415}
6416
6417bool X86TTIImpl::supportsGather() const {
6418 // Some CPUs have better gather performance than others.
6419 // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
6420 // enable gather with a -march.
6421 return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
6422}
6423
6424bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy,
6425 Align Alignment) const {
6426 // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
6427 // A vector-4 gather/scatter instruction does not exist on KNL. We can extend
6428 // it to 8 elements, but zeroing upper bits of the mask vector will add more
6429 // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
6430 // Check whether the gather/scatter instruction is better in the VariableMask
6431 // case.
6432 unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
6433 return NumElts == 1 ||
6434 (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
6435}
6436
6437bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy,
6438 Align Alignment) const {
6439 Type *ScalarTy = DataTy->getScalarType();
6440 if (ScalarTy->isPointerTy())
6441 return true;
6442
6443 if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
6444 return true;
6445
6446 if (!ScalarTy->isIntegerTy())
6447 return false;
6448
6449 unsigned IntWidth = ScalarTy->getIntegerBitWidth();
6450 return IntWidth == 32 || IntWidth == 64;
6451}
6452
6453bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const {
6454 if (!supportsGather() || !ST->preferGather())
6455 return false;
6456 return isLegalMaskedGatherScatter(DataTy, Alignment);
6457}
6458
6459bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
6460 unsigned Opcode1,
6461 const SmallBitVector &OpcodeMask) const {
6462 // ADDSUBPS 4xf32 SSE3
6463 // VADDSUBPS 4xf32 AVX
6464 // VADDSUBPS 8xf32 AVX2
6465 // ADDSUBPD 2xf64 SSE3
6466 // VADDSUBPD 2xf64 AVX
6467 // VADDSUBPD 4xf64 AVX2
6468
6469 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
6470 assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
6471 if (!isPowerOf2_32(NumElements))
6472 return false;
6473 // Check the opcode pattern. We apply the mask on the opcode arguments and
6474 // then check if it is what we expect.
6475 for (int Lane : seq<int>(0, NumElements)) {
6476 unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
6477 // We expect FSub for even lanes and FAdd for odd lanes.
6478 if (Lane % 2 == 0 && Opc != Instruction::FSub)
6479 return false;
6480 if (Lane % 2 == 1 && Opc != Instruction::FAdd)
6481 return false;
6482 }
6483 // Now check that the pattern is supported by the target ISA.
6484 Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
6485 if (ElemTy->isFloatTy())
6486 return ST->hasSSE3() && NumElements % 4 == 0;
6487 if (ElemTy->isDoubleTy())
6488 return ST->hasSSE3() && NumElements % 2 == 0;
6489 return false;
6490}
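The even-FSub / odd-FAdd pattern above is exactly the ADDSUB lane layout; a sketch of the mask walk with a simple bit vector, so a 4-lane addsub expects mask bits 0b1010 when Opcode0 is FSub and Opcode1 is FAdd:

#include <vector>

enum SketchOpcode { SketchFAdd, SketchFSub };

// Lane i uses Opcode1 when its mask bit is set, Opcode0 otherwise; accept
// only FSub on even lanes and FAdd on odd lanes.
static bool isAddSubMask(const std::vector<bool> &OpcodeMask,
                         SketchOpcode Opcode0, SketchOpcode Opcode1) {
  for (size_t Lane = 0; Lane < OpcodeMask.size(); ++Lane) {
    SketchOpcode Opc = OpcodeMask[Lane] ? Opcode1 : Opcode0;
    if ((Lane % 2 == 0 && Opc != SketchFSub) ||
        (Lane % 2 == 1 && Opc != SketchFAdd))
      return false;
  }
  return true;
}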
6491
6492bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const {
6493 // AVX2 doesn't support scatter
6494 if (!ST->hasAVX512() || !ST->preferScatter())
6495 return false;
6496 return isLegalMaskedGatherScatter(DataType, Alignment);
6497}
6498
6499bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
6500 EVT VT = TLI->getValueType(DL, DataType);
6501 return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
6502}
6503
6504bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) const {
6505 // FDIV is always expensive, even if it has a very low uop count.
6506 // TODO: Still necessary for recent CPUs with low latency/throughput fdiv?
6507 if (I->getOpcode() == Instruction::FDiv)
6508 return true;
6509
6510 return BaseT::isExpensiveToSpeculativelyExecute(I);
6511}
6512
6513bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; }
6514
6515bool X86TTIImpl::areInlineCompatible(const Function *Caller,
6516 const Function *Callee) const {
6517 const TargetMachine &TM = getTLI()->getTargetMachine();
6518
6519 // Work this as a subsetting of subtarget features.
6520 const FeatureBitset &CallerBits =
6521 TM.getSubtargetImpl(*Caller)->getFeatureBits();
6522 const FeatureBitset &CalleeBits =
6523 TM.getSubtargetImpl(*Callee)->getFeatureBits();
6524
6525 // Check whether features are the same (apart from the ignore list).
6526 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
6527 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
6528 if (RealCallerBits == RealCalleeBits)
6529 return true;
6530
6531 // If the features are a subset, we need to additionally check for calls
6532 // that may become ABI-incompatible as a result of inlining.
6533 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
6534 return false;
6535
6536 for (const Instruction &I : instructions(Callee)) {
6537 if (const auto *CB = dyn_cast<CallBase>(&I)) {
6538 // Having more target features is fine for inline ASM and intrinsics.
6539 if (CB->isInlineAsm() || CB->getIntrinsicID() != Intrinsic::not_intrinsic)
6540 continue;
6541
6542 SmallVector<Type *, 8> Types;
6543 for (Value *Arg : CB->args())
6544 Types.push_back(Arg->getType());
6545 if (!CB->getType()->isVoidTy())
6546 Types.push_back(CB->getType());
6547
6548 // Simple types are always ABI compatible.
6549 auto IsSimpleTy = [](Type *Ty) {
6550 return !Ty->isVectorTy() && !Ty->isAggregateType();
6551 };
6552 if (all_of(Types, IsSimpleTy))
6553 continue;
6554
6555 // Do a precise compatibility check.
6556 if (!areTypesABICompatible(Caller, Callee, Types))
6557 return false;
6558 }
6559 }
6560 return true;
6561}
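The feature compatibility test above is a plain bitset-subset check after masking off the ignore list; a compact sketch with std::bitset (width and field meanings illustrative):

#include <bitset>

// Inlining is feature-safe when every (non-ignored) callee feature is also
// present in the caller, i.e. the callee bits are a subset of the caller's.
static bool featuresCompatible(std::bitset<64> Caller, std::bitset<64> Callee,
                               std::bitset<64> IgnoreList) {
  Caller &= ~IgnoreList;
  Callee &= ~IgnoreList;
  return (Caller & Callee) == Callee;
}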
6562
6563bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
6564 const Function *Callee,
6565 const ArrayRef<Type *> &Types) const {
6566 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
6567 return false;
6568
6569 // If we get here, we know the target features match. If one function
6570 // considers 512-bit vectors legal and the other does not, consider them
6571 // incompatible.
6572 const TargetMachine &TM = getTLI()->getTargetMachine();
6573
6574 if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
6575 TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
6576 return true;
6577
6578 // Consider the arguments compatible if they aren't vectors or aggregates.
6579 // FIXME: Look at the size of vectors.
6580 // FIXME: Look at the element types of aggregates to see if there are vectors.
6581 return llvm::none_of(Types,
6582 [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
6583}
6584
6585TTI::MemCmpExpansionOptions
6586X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
6587 TTI::MemCmpExpansionOptions Options;
6588 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
6589 Options.NumLoadsPerBlock = 2;
6590 // All GPR and vector loads can be unaligned.
6591 Options.AllowOverlappingLoads = true;
6592 if (IsZeroCmp) {
6593 // Only enable vector loads for equality comparison. Right now the vector
6594 // version is not as fast for three way compare (see #33329).
6595 const unsigned PreferredWidth = ST->getPreferVectorWidth();
6596 if (PreferredWidth >= 512 && ST->hasAVX512())
6597 Options.LoadSizes.push_back(64);
6598 if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
6599 if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
6600 }
6601 if (ST->is64Bit()) {
6602 Options.LoadSizes.push_back(8);
6603 }
6604 Options.LoadSizes.push_back(4);
6605 Options.LoadSizes.push_back(2);
6606 Options.LoadSizes.push_back(1);
6607 return Options;
6608}
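With AllowOverlappingLoads set, a 15-byte equality compare on a 64-bit target can be covered by two overlapping 8-byte loads per buffer (offsets 0 and 7) instead of an 8+4+2+1 chain; a sketch of that offset computation (illustrative only, not the actual expansion pass):

#include <vector>

// Offsets of size-S loads covering Len bytes, overlapping the final load
// backwards instead of falling back to smaller load sizes. Assumes S <= Len.
static std::vector<unsigned> overlappingLoadOffsets(unsigned Len, unsigned S) {
  std::vector<unsigned> Offsets;
  for (unsigned Off = 0; Off + S < Len; Off += S)
    Offsets.push_back(Off);
  Offsets.push_back(Len - S); // final load may overlap the previous one
  return Offsets;
}
// overlappingLoadOffsets(15, 8) == {0, 7}: two loads instead of four.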
6609
6610bool X86TTIImpl::prefersVectorizedAddressing() const {
6611 return supportsGather();
6612}
6613
6614bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
6615 return false;
6616}
6617
6618bool X86TTIImpl::enableInterleavedAccessVectorization() const {
6619 // TODO: We expect this to be beneficial regardless of arch,
6620 // but there are currently some unexplained performance artifacts on Atom.
6621 // As a temporary solution, disable on Atom.
6622 return !(ST->isAtom());
6623}
6624
6625// Get estimation for interleaved load/store operations and strided load.
6626// \p Indices contains indices for strided load.
6627// \p Factor - the factor of interleaving.
6628 // AVX-512 provides 3-src shuffles that significantly reduce the cost.
6629InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
6630 unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
6631 ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
6632 TTI::TargetCostKind CostKind, bool UseMaskForCond,
6633 bool UseMaskForGaps) const {
6634 // VecTy for interleave memop is <VF*Factor x Elt>.
6635 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6636 // VecTy = <12 x i32>.
6637
6638 // Calculate the number of memory operations (NumOfMemOps), required
6639 // for load/store the VecTy.
6640 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6641 unsigned VecTySize = DL.getTypeStoreSize(VecTy);
6642 unsigned LegalVTSize = LegalVT.getStoreSize();
6643 unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
6644
6645 // Get the cost of one memory operation.
6646 auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
6647 LegalVT.getVectorNumElements());
6648 InstructionCost MemOpCost;
6649 bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
6650 if (UseMaskedMemOp)
6651 MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
6652 AddressSpace, CostKind);
6653 else
6654 MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace,
6655 CostKind);
6656
6657 unsigned VF = VecTy->getNumElements() / Factor;
6658 MVT VT =
6659 MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
6660
6661 InstructionCost MaskCost;
6662 if (UseMaskedMemOp) {
6663 APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
6664 for (unsigned Index : Indices) {
6665 assert(Index < Factor && "Invalid index for interleaved memory op");
6666 for (unsigned Elm = 0; Elm < VF; Elm++)
6667 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
6668 }
6669
6670 Type *I1Type = Type::getInt1Ty(VecTy->getContext());
6671
6672 MaskCost = getReplicationShuffleCost(
6673 I1Type, Factor, VF,
6674 UseMaskForGaps ? DemandedLoadStoreElts
6675 : APInt::getAllOnes(VecTy->getNumElements()),
6676 CostKind);
6677
6678 // The Gaps mask is invariant and created outside the loop, therefore the
6679 // cost of creating it is not accounted for here. However if we have both
6680 // a MaskForGaps and some other mask that guards the execution of the
6681 // memory access, we need to account for the cost of And-ing the two masks
6682 // inside the loop.
6683 if (UseMaskForGaps) {
6684 auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
6685 MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
6686 }
6687 }
6688
6689 if (Opcode == Instruction::Load) {
6690 // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
6691 // contain the cost of the optimized shuffle sequence that the
6692 // X86InterleavedAccess pass will generate.
6693 // The cost of loads and stores are computed separately from the table.
6694
6695 // X86InterleavedAccess support only the following interleaved-access group.
6696 static const CostTblEntry AVX512InterleavedLoadTbl[] = {
6697 {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
6698 {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
6699 {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
6700 };
6701
6702 if (const auto *Entry =
6703 CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
6704 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6705 // If an entry does not exist, fall back to the default implementation.
6706
6707 // Kind of shuffle depends on number of loaded values.
6708 // If we load the entire data in one register, we can use a 1-src shuffle.
6709 // Otherwise, we'll merge 2 sources in each operation.
6710 TTI::ShuffleKind ShuffleKind =
6711 (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
6712
6713 InstructionCost ShuffleCost = getShuffleCost(
6714 ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);
6715
6716 unsigned NumOfLoadsInInterleaveGrp =
6717 Indices.size() ? Indices.size() : Factor;
6718 auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
6719 VecTy->getNumElements() / Factor);
6720 InstructionCost NumOfResults =
6721 getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp;
6722
6723 // About half of the loads may be folded into shuffles when we have only
6724 // one result. If we have more than one result, or the loads are masked,
6725 // we do not fold loads at all.
6726 unsigned NumOfUnfoldedLoads =
6727 UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
6728
6729 // Get a number of shuffle operations per result.
6730 unsigned NumOfShufflesPerResult =
6731 std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
6732
6733 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6734 // When we have more than one destination, we need additional instructions
6735 // to keep sources.
6736 InstructionCost NumOfMoves = 0;
6737 if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
6738 NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
6739
6740 InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
6741 MaskCost + NumOfUnfoldedLoads * MemOpCost +
6742 NumOfMoves;
6743
6744 return Cost;
6745 }
6746
6747 // Store.
6748 assert(Opcode == Instruction::Store &&
6749 "Expected Store Instruction at this point");
6750 // X86InterleavedAccess support only the following interleaved-access group.
6751 static const CostTblEntry AVX512InterleavedStoreTbl[] = {
6752 {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
6753 {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
6754 {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
6755
6756 {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
6757 {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
6758 {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
6759 {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
6760 };
6761
6762 if (const auto *Entry =
6763 CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
6764 return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
6765 // If an entry does not exist, fall back to the default implementation.
6766
6767 // There are no strided stores at the moment, and a store can't be folded
6768 // into a shuffle.
6769 unsigned NumOfSources = Factor; // The number of values to be merged.
6770 InstructionCost ShuffleCost =
6771 getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
6772 CostKind, 0, nullptr);
6773 unsigned NumOfShufflesPerStore = NumOfSources - 1;
6774
6775 // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
6776 // We need additional instructions to keep sources.
6777 unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
6778 InstructionCost Cost =
6779 MaskCost +
6780 NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
6781 NumOfMoves;
6782 return Cost;
6783}
6784
6785InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
6786 unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
6787 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
6788 bool UseMaskForCond, bool UseMaskForGaps) const {
6789 auto *VecTy = cast<FixedVectorType>(BaseTy);
6790
6791 auto isSupportedOnAVX512 = [&](Type *VecTy) {
6792 Type *EltTy = cast<VectorType>(VecTy)->getElementType();
6793 if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
6794 EltTy->isIntegerTy(32) || EltTy->isPointerTy())
6795 return true;
6796 if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
6797 return ST->hasBWI();
6798 if (EltTy->isBFloatTy())
6799 return ST->hasBF16();
6800 return false;
6801 };
6802 if (ST->hasAVX512() && isSupportedOnAVX512(VecTy))
6803 return getInterleavedMemoryOpCostAVX512(
6804 Opcode, VecTy, Factor, Indices, Alignment,
6805 AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
6806
6807 if (UseMaskForCond || UseMaskForGaps)
6808 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6809 Alignment, AddressSpace, CostKind,
6810 UseMaskForCond, UseMaskForGaps);
6811
6812 // Get estimation for interleaved load/store operations for SSE-AVX2.
6813 // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
6814 // computing the cost using a generic formula as a function of generic
6815 // shuffles. We therefore use a lookup table instead, filled according to
6816 // the instruction sequences that codegen currently generates.
6817
6818 // VecTy for interleave memop is <VF*Factor x Elt>.
6819 // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
6820 // VecTy = <12 x i32>.
6821 MVT LegalVT = getTypeLegalizationCost(VecTy).second;
6822
6823 // This function can be called with VecTy=<6xi128>, Factor=3, in which case
6824 // the VF=2, while v2i128 is an unsupported MVT vector type
6825 // (see MachineValueType.h::getVectorVT()).
6826 if (!LegalVT.isVector())
6827 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6828 Alignment, AddressSpace, CostKind);
6829
6830 unsigned VF = VecTy->getNumElements() / Factor;
6831 Type *ScalarTy = VecTy->getElementType();
6832 // Deduplicate entries, model floats/pointers as appropriately-sized integers.
6833 if (!ScalarTy->isIntegerTy())
6834 ScalarTy =
6835 Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
6836
6837 // Get the cost of all the memory operations.
6838 // FIXME: discount dead loads.
6839 InstructionCost MemOpCosts =
6840 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
6841
6842 auto *VT = FixedVectorType::get(ScalarTy, VF);
6843 EVT ETy = TLI->getValueType(DL, VT);
6844 if (!ETy.isSimple())
6845 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
6846 Alignment, AddressSpace, CostKind);
6847
6848 // TODO: Complete for other data-types and strides.
6849 // Each combination of Stride, element bit width and VF results in a different
6850 // sequence; The cost tables are therefore accessed with:
6851 // Factor (stride) and VectorType=VFxiN.
6852 // The Cost accounts only for the shuffle sequence;
6853 // the cost of the loads/stores is accounted for separately.
6854 //
6855 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
6856 {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
6857 {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
6858 {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
6859 {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
6860 {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
6861
6862 {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
6863 {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
6864 {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
6865
6866 {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
6867 {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
6868 {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
6869
6870 {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
6871 {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
6872 {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
6873 {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
6874
6875 {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
6876 {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
6877 {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
6878 {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
6879 {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
6880
6881 {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
6882 {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
6883 {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
6884 {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
6885 {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
6886
6887 {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
6888 {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
6889 {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
6890 {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
6891 {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
6892
6893 {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
6894 {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
6895 {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
6896 {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
6897
6898 {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
6899 {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
6900 {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
6901 {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
6902 {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
6903
6904 {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
6905 {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
6906 {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
6907 {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
6908 {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
6909
6910 {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
6911 {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
6912 {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
6913 {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
6914 {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
6915
6916 {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
6917 {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
6918 {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
6919 {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
6920
6921 {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
6922 {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
6923 {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
6924 {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
6925 {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
6926
6927 {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
6928 {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
6929 {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
6930 {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
6931 {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
6932
6933 {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
6934 {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
6935 {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
6936 {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
6937
6938 {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
6939 {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
6940 {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
6941
6942 {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
6943 };
6944
6945 static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
6946 {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
6947 };
6948
6949 static const CostTblEntry SSE2InterleavedLoadTbl[] = {
6950 {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
6951 {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
6952
6953 {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
6954 {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
6955
6956 {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
6957 };
6958
6959 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
6960 {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
6961 {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
6962
6963 {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
6964 {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
6965 {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
6966
6967 {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
6968 {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
6969 {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
6970 {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
6971
6972 {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
6973 {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
6974 {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
6975 {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
6976 {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
6977
6978 {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
6979 {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
6980 {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
6981 {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
6982 {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
6983
6984 {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
6985 {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
6986 {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
6987 {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
6988 {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
6989
6990 {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
6991 {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
6992 {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
6993 {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
6994 {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
6995
6996 {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
6997 {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
6998 {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
6999 {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
7000
7001 {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
7002 {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
7003 {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
7004 {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
7005 {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
7006
7007 {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
7008 {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
7009 {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
7010 {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
7011 {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
7012
7013 {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
7014 {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
7015 {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
7016 {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
7017 {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
7018
7019 {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
7020 {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
7021 {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
7022 {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
7023
7024 {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
7025 {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
7026 {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
7027 {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
7028 {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
7029
7030 {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
7031 {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
7032 {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
7033 {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
7034 {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
7035
7036 {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
7037 {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
7038 {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
7039 {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
7040
7041 {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
7042 {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
7043 {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
7044 };
7045
7046 static const CostTblEntry SSE2InterleavedStoreTbl[] = {
7047 {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
7048 {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
7049 {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
7050
7051 {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
7052 {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
7053
7054 {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
7055 };
7056
7057 if (Opcode == Instruction::Load) {
7058 auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
7059 MemOpCosts](const CostTblEntry *Entry) {
7060 // NOTE: this is just an approximation!
7061 // It can over- or under-estimate the cost!
7062 return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
7063 };
7064
7065 if (ST->hasAVX2())
7066 if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
7067 ETy.getSimpleVT()))
7068 return GetDiscountedCost(Entry);
7069
7070 if (ST->hasSSSE3())
7071 if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
7072 ETy.getSimpleVT()))
7073 return GetDiscountedCost(Entry);
7074
7075 if (ST->hasSSE2())
7076 if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
7077 ETy.getSimpleVT()))
7078 return GetDiscountedCost(Entry);
7079 } else {
7080 assert(Opcode == Instruction::Store &&
7081 "Expected Store Instruction at this point");
7082 assert((!Indices.size() || Indices.size() == Factor) &&
7083 "Interleaved store only supports fully-interleaved groups.");
7084 if (ST->hasAVX2())
7085 if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
7086 ETy.getSimpleVT()))
7087 return MemOpCosts + Entry->Cost;
7088
7089 if (ST->hasSSE2())
7090 if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
7091 ETy.getSimpleVT()))
7092 return MemOpCosts + Entry->Cost;
7093 }
7094
7095 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
7096 Alignment, AddressSpace, CostKind,
7097 UseMaskForCond, UseMaskForGaps);
7098}
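The load-side discount above charges only the used fraction of the table's full-deinterleave shuffle cost: with Factor = 3, a table cost of 11 (the v16i8 entry) and 2 of 3 members used, that is ceil(2*11/3) = 8 shuffles on top of the memory cost. A sketch:

// divideCeil(NumMembers * TblCost, Factor), as in GetDiscountedCost above.
static unsigned discountedShuffleCost(unsigned NumMembers, unsigned TblCost,
                                      unsigned Factor) {
  return (NumMembers * TblCost + Factor - 1) / Factor;
}
// discountedShuffleCost(2, 11, 3) == 8.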
7099
7100InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
7101 StackOffset BaseOffset,
7102 bool HasBaseReg, int64_t Scale,
7103 unsigned AddrSpace) const {
7104 // Scaling factors are not free at all.
7105 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
7106 // will take 2 allocations in the out of order engine instead of 1
7107 // for plain addressing mode, i.e. inst (reg1).
7108 // E.g.,
7109 // vaddps (%rsi,%rdx), %ymm0, %ymm1
7110 // Requires two allocations (one for the load, one for the computation)
7111 // whereas:
7112 // vaddps (%rsi), %ymm0, %ymm1
7113 // Requires just 1 allocation, i.e., freeing allocations for other operations
7114 // and having fewer micro-operations to execute.
7115 //
7116 // For some X86 architectures, this is even worse because for instance for
7117 // stores, the complex addressing mode forces the instruction to use the
7118 // "load" ports instead of the dedicated "store" port.
7119 // E.g., on Haswell:
7120 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
7121 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
7122 TargetLoweringBase::AddrMode AM;
7123 AM.BaseGV = BaseGV;
7124 AM.BaseOffs = BaseOffset.getFixed();
7125 AM.HasBaseReg = HasBaseReg;
7126 AM.Scale = Scale;
7127 AM.ScalableOffset = BaseOffset.getScalable();
7128 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
7129 // Scale represents reg2 * scale, thus account for 1
7130 // as soon as we use a second register.
7131 return AM.Scale != 0;
7132 return InstructionCost::getInvalid();
7133}
7134
7135InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
7136 // TODO: Hook MispredictPenalty of SchedMachineModel into this.
7137 return 14;
7138}
7139
7140bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const {
7141 unsigned Bits = Ty->getScalarSizeInBits();
7142
7143 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
7144 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
7145 if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
7146 return false;
7147
7148 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
7149 // shifts just as cheap as scalar ones.
7150 if (ST->hasAVX2() && (Bits == 32 || Bits == 64))
7151 return false;
7152
7153 // AVX512BW has shifts such as vpsllvw.
7154 if (ST->hasBWI() && Bits == 16)
7155 return false;
7156
7157 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
7158 // fully general vector.
7159 return true;
7160}
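The distinction this hook draws, in loop form: a splatted (uniform) shift amount lowers to a single shift with a scalar count on any SSE2 target, while genuinely per-lane amounts need vpsllv* (AVX2/AVX-512) or scalarization. A sketch of the two shapes in plain C++:

// Uniform amount: vectorizes to pslld xmm, xmm/imm8 everywhere.
static void shiftUniform(unsigned *V, int N, unsigned Amt) {
  for (int I = 0; I < N; ++I)
    V[I] <<= Amt;
}

// Per-lane amounts: needs vpsllvd (AVX2) or per-element expansion.
static void shiftVariable(unsigned *V, const unsigned *Amt, int N) {
  for (int I = 0; I < N; ++I)
    V[I] <<= Amt[I];
}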
7161
7162unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
7163 Type *ScalarValTy) const {
7164 if (ST->hasF16C() && ScalarMemTy->isHalfTy()) {
7165 return 4;
7166 }
7167 return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
7168}
7169
7170bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
7171 SmallVectorImpl<Use *> &Ops) const {
7172 using namespace llvm::PatternMatch;
7173
7174 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
7175 if (!VTy)
7176 return false;
7177
7178 if (I->getOpcode() == Instruction::Mul &&
7179 VTy->getElementType()->isIntegerTy(64)) {
7180 for (auto &Op : I->operands()) {
7181 // Make sure we are not already sinking this operand
7182 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7183 continue;
7184
7185 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
7186 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
7187 if (ST->hasSSE41() &&
7188 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
7189 m_SpecificInt(32)))) {
7190 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7191 Ops.push_back(&Op);
7192 } else if (ST->hasSSE2() &&
7193 match(Op.get(),
7194 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
7195 Ops.push_back(&Op);
7196 }
7197 }
7198
7199 return !Ops.empty();
7200 }
7201
7202 // A uniform shift amount in a vector shift or funnel shift may be much
7203 // cheaper than a generic variable vector shift, so make that pattern visible
7204 // to SDAG by sinking the shuffle instruction next to the shift.
7205 int ShiftAmountOpNum = -1;
7206 if (I->isShift())
7207 ShiftAmountOpNum = 1;
7208 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
7209 if (II->getIntrinsicID() == Intrinsic::fshl ||
7210 II->getIntrinsicID() == Intrinsic::fshr)
7211 ShiftAmountOpNum = 2;
7212 }
7213
7214 if (ShiftAmountOpNum == -1)
7215 return false;
7216
7217 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
7218 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
7219 isVectorShiftByScalarCheap(I->getType())) {
7220 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
7221 return true;
7222 }
7223
7224 return false;
7225}
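The m_And pattern matched above is the PMULUDQ shape: both 64-bit multiply operands carry only 32 significant bits, so hardware can form the full 64-bit product from a single 32x32 multiply. In scalar form, a sketch:

#include <cstdint>

// Both operands are zext_inreg from 32 bits (the 0xffffffff mask above), so
// the 64-bit product equals a 32x32->64 multiply, which is what sinking the
// masks next to the mul lets SelectionDAG see.
static uint64_t pmuludqSketch(uint64_t A, uint64_t B) {
  return (A & 0xffffffffULL) * (B & 0xffffffffULL);
}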
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define I(x, y, z)
Definition MD5.cpp:58
#define T
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
CostTblEntryT< CostKindCosts > CostKindTblEntry
static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST)
TypeConversionCostTblEntryT< CostKindCosts > TypeConversionCostKindTblEntry
This file a TargetTransformInfoImplBase conforming object specific to the X86 target machine.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:681
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:708
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:702
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:701
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:686
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:689
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ ICMP_NE
not equal
Definition InstrTypes.h:700
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:706
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:688
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Class to represent pointers.
unsigned getAddressSpace() const
Return the address space of the Pointer type.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
bool test(unsigned Idx) const
size_type size() const
Returns the number of bits in this bitvector.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:31
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:40
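StackOffset pairs a fixed byte offset with a vscale-scaled one. A minimal sketch of the factories and accessors (on x86, which has no scalable vectors, the scalable half stays zero in practice):

  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  void offsets() {
    // 32 fixed bytes plus 16 bytes scaled by vscale.
    StackOffset Off = StackOffset::get(32, 16);
    (void)Off.getFixed();     // 32
    (void)Off.getScalable();  // 16
  }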
Primary interface to the complete machine description for the target machine.
virtual InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const
bool isStridedAccess(const SCEV *Ptr) const
unsigned minRequiredElementSize(const Value *Val, bool &isSigned) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
const SCEVConstant * getConstantStrideStep(ScalarEvolution *SE, const SCEV *Ptr) const
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of the instruction.
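A sketch of how a client asks the same question under different cost kinds; TTI and I are assumed to come from the caller (for example a pass holding the TargetTransformInfo analysis result):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  void compareKinds(const TargetTransformInfo &TTI, const Instruction &I) {
    // Throughput drives the vectorizers' per-lane arithmetic...
    InstructionCost Thru =
        TTI.getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
    // ...while size feeds inlining and unrolling thresholds.
    InstructionCost Size =
        TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
    (void)Thru; (void)Size;
  }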
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
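A sketch of a query against one of these kinds, mirroring the getShuffleCost signature listed elsewhere in this index (SK_Reverse with an explicit mask over <8 x i16>):

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  InstructionCost reverseCost(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
    auto *VTy = FixedVectorType::get(Type::getInt16Ty(Ctx), 8);
    SmallVector<int, 8> Mask = {7, 6, 5, 4, 3, 2, 1, 0};
    // Source and destination types coincide for a reverse shuffle.
    return TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VTy, VTy, Mask,
                              TargetTransformInfo::TCK_RecipThroughput,
                              /*Index=*/0, /*SubTp=*/nullptr);
  }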
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
CacheLevel
The possible cache levels.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
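The two factories above are the whole story for x86, where every native type is fixed-width. A minimal sketch of both, plus the accessors listed later in this index:

  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  void sizes() {
    TypeSize Fixed = TypeSize::getFixed(128);    // exactly 128 bits
    TypeSize Scal = TypeSize::getScalable(128);  // 128 * vscale bits
    (void)Fixed.getFixedValue();                 // 128; asserts if scalable
    (void)Scal.isScalable();                     // true
    (void)Scal.getKnownMinValue();               // 128, the vscale=1 floor
  }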
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:286
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition Type.h:381
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
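A sketch of the usual walk through these Type accessors: strip a (possibly vector) type to its scalar element, then measure it, which is how most cost-table keys are formed:

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Type.h"
  using namespace llvm;

  unsigned scalarBits(Type *Ty) {
    Type *Elt = Ty->getScalarType();  // i32 for <4 x i32>; Ty itself if scalar
    if (Elt->isIntegerTy())
      return Elt->getIntegerBitWidth();
    if (Elt->isFloatingPointTy())
      return Elt->getPrimitiveSizeInBits().getFixedValue();
    return 0;                         // pointers etc.: not sized here
  }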
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
Base class of all SIMD vector types.
static VectorType * getExtendedElementVectorType(VectorType *VTy)
This static method is like getInteger except that the element types are twice as wide as the elements...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getDoubleElementsVectorType(VectorType *VTy)
This static method returns a VectorType with twice as many elements as the input type and the same el...
Type * getElementType() const
bool useAVX512Regs() const
bool hasAVX512() const
bool hasAVX2() const
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) const override
bool isLegalNTLoad(Type *DataType, Align Alignment) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace) const override
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
unsigned getRegisterClassForType(bool Vector, Type *Ty) const override
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalNTStore(Type *DataType, Align Alignment) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCostAVX512(unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isVectorShiftByScalarCheap(Type *Ty) const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Type) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
unsigned getAtomicMemIntrinsicMaxElementSize() const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) const override
InstructionCost getBranchMispredictPenalty() const override
bool isExpensiveToSpeculativelyExecute(const Instruction *I) const override
bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const override
std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedGatherScatter(Type *DataType, Align Alignment) const
bool enableInterleavedAccessVectorization() const override
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
Calculate the cost of Gather / Scatter operation.
unsigned getNumberOfRegisters(unsigned ClassID) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
Estimate the overhead of scalarizing an instruction.
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool supportsEfficientVectorElementLoadStore() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
InstructionCost getIntImmCost(int64_t) const
Calculate the cost of materializing a 64-bit value.
InstructionCost getMinMaxCost(Intrinsic::ID IID, Type *Ty, TTI::TargetCostKind CostKind, FastMathFlags FMF) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool canMacroFuseCmp() const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
bool prefersVectorizedAddressing() const override
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const override
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth.
Definition APInt.cpp:3009
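A sketch of the bitmask scaling above, used when a demanded-elements mask must follow a type through widening or narrowing; note the function lives in the APIntOps namespace:

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  void scale() {
    // Widen a 4-lane mask 0b1010 to 8 bits: each bit splats to two.
    APInt Wide = APIntOps::ScaleBitMask(APInt(4, 0b1010), 8); // 0b11001100
    (void)Wide;
  }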
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
apint_match m_APIntAllowPoison(const APInt *&Res)
Match APInt while allowing poison in splat vector constants.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
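A sketch combining the matchers above: recognize a single-use load feeding a shift-left by 3, the kind of operand peeking cost heuristics do. The function name is illustrative:

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  bool isShiftedLoad(Value *V) {
    Value *Ptr;
    // Matches: shl (load Ptr), 3 -- where the load has no other users.
    return match(V, m_Shl(m_OneUse(m_Load(m_Value(Ptr))), m_SpecificInt(3)));
  }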
This is an optimization pass for GlobalISel generic memory operations.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1727
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
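A sketch of the table-lookup idiom this whole file is built on; the table and its numbers here are placeholders, not real x86 costs:

  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/ISDOpcodes.h"
  using namespace llvm;

  static const CostTblEntry DemoTbl[] = {
    { ISD::ADD, MVT::v4i32, 1 }, // placeholder costs
    { ISD::SHL, MVT::v4i32, 2 },
  };

  unsigned lookupCost(int ISDOpc, MVT VT) {
    if (const auto *Entry = CostTableLookup(DemoTbl, ISDOpc, VT))
      return Entry->Cost;
    return ~0U; // no entry: caller falls back to a default
  }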
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2474
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition MathExtras.h:551
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:682
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:390
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
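A sketch of a known-bits query as cost heuristics use it, here proving a scalar is a multiple of 4; DL is assumed to come from the enclosing module:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  bool knownMultipleOf4(const Value *V, const DataLayout &DL) {
    if (!V->getType()->isIntegerTy())
      return false;
    KnownBits Known(V->getType()->getScalarSizeInBits());
    computeKnownBits(V, Known, DL);
    return Known.countMinTrailingZeros() >= 2; // low two bits known zero
  }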
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1741
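These range wrappers appear all over the cost code; a sketch checking a shuffle mask for poison lanes (PoisonMaskElem is the -1 sentinel listed just below):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  bool hasNoPoisonLanes(ArrayRef<int> Mask) {
    return none_of(Mask, [](int M) { return M == PoisonMaskElem; });
  }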
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:399
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
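The rounding helpers listed nearby, gathered into one sketch; the literals are arbitrary:

  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"
  using namespace llvm;

  void rounding() {
    (void)alignTo(100, Align(16)); // 112: next multiple of 16
    (void)alignDown(100, 16);      // 96: previous multiple of 16
    (void)PowerOf2Ceil(100);       // 128
    (void)divideCeil(100, 16);     // 7: chunks of at most 16
  }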
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1837
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1963
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
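The conversion-table sibling of CostTableLookup, keyed on (opcode, destination, source); again a placeholder table, not real costs:

  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/ISDOpcodes.h"
  using namespace llvm;

  static const TypeConversionCostTblEntry DemoConvTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, // placeholder
  };

  unsigned sextCost() {
    if (const auto *Entry = ConvertCostTableLookup(DemoConvTbl, ISD::SIGN_EXTEND,
                                                   MVT::v4i64, MVT::v4i32))
      return Entry->Cost;
    return ~0U;
  }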
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
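A sketch of the splat probe above; it returns the broadcast lane, or -1 when the mask is not a splat:

  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  void splats() {
    (void)getSplatIndex({2, 2, 2, 2}); // 2: every lane reads element 2
    (void)getSplatIndex({0, 1, 2, 3}); // -1: not a splat
  }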
#define N
std::optional< unsigned > operator[](TargetTransformInfo::TargetCostKind Kind) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Cost Table Entry.
Definition CostTable.h:25
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:368
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:311
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:318
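A sketch tying the EVT queries above to the MVT tables: check simplicity first, then hand the simple type to the lookups:

  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;

  void legalized(EVT VT) {
    if (!VT.isSimple() || !VT.isVector())
      return;                          // extended EVTs never hit the tables
    MVT Simple = VT.getSimpleVT();     // e.g. MVT::v8i16
    EVT Elt = VT.getScalarType();      // e.g. i16
    (void)Simple; (void)Elt; (void)VT.getSizeInBits(); // e.g. 128 bits
  }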
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:235
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Type Conversion Cost Table.
Definition CostTable.h:55